Figure 1

Figure 1C

#Load libraries.
library(skitools)
library(dplyr)

#Load identity calls per cell and patient.
combined_emb = read.table(file.path(params$fileLoc, "combined_emb_for_carc_for_homogenous_sikk_ep_500_nw.csv"),sep=",",header=TRUE)
query_samp = combined_emb[which(combined_emb$ref_or_query=="query"),]
cellcount_mat = query_samp[,c("X","Patient")]
aa = as.data.frame(table(cellcount_mat$Patient))
grtr_than_50 = aa[which(aa$Freq>50),]
query_samp_nw = query_samp[which(query_samp$Patient %in% grtr_than_50$Var1),]
query_samp_nw$ann_finest_lev_transferred_label_filtered = ifelse(query_samp_nw$ann_finest_lev_transfer_uncert<0.5,query_samp_nw$ann_finest_lev_transferred_label_unfiltered,"Unknown")
query_samp_nw = query_samp_nw[which(query_samp_nw$ann_finest_lev_transferred_label_filtered!="Unknown"),]

#Select patients to plot (LUAD is the variable name but it includes LUSC and LCNEC patients as well).
LUAD = c('LX681','UHL-7','WCM-3','UHL-3','LX666','WCM-2','LX701','LX679','WCM-4','UHL-5','UHL-8','LX661','LX699','LX676','UHL-1','UHL-4','UHL-6','WCM-1','PS05','PS06','PS09','PS03','PS04','PS10','UHL-2','LX680')
LUAD_query = query_samp_nw[which(query_samp_nw$Patient %in% LUAD),]
distal = c("AT0","AT1","AT2","AT2 proliferating")
rare = c("Ionocyte","Tuft","Neuroendocrine")
distal_rare = c(distal,rare)
epicells = c("AT1","AT2","AT2 proliferating","AT0","Suprabasal","Basal resting","Hillock-like","Multiciliated (non-nasal)","Multiciliated (nasal)","Deuterosomal","Neuroendocrine","Ionocyte","Tuft","Goblet (nasal)","Club (nasal)","Club (non-nasal)","pre-TB secretory","Goblet (bronchial)","Goblet (subsegmental)","SMG serous (bronchial)","SMG mucous","SMG duct","SMG serous (nasal)")
proximal = setdiff(epicells,distal_rare)
LUAD_query$cat = ifelse(LUAD_query$ann_finest_lev_transferred_label_filtered %in% distal,"distal",LUAD_query$ann_finest_lev_transferred_label_filtered)

#Group cells in categories for plotting.
LUAD_query$cat = ifelse(LUAD_query$cat=="rare",LUAD_query$ann_finest_lev_transferred_label_filtered,LUAD_query$cat)
AT1 = c("AT1")
AT2 = c("AT2","AT2 proliferating","AT0")
basal = c("Suprabasal","Basal resting","Hillock-like")
multicilated = c("Multiciliated (non-nasal)","Multiciliated (nasal)","Deuterosomal")
rare = c("Neuroendocrine","Ionocyte","Tuft")
secretory = c("Goblet (nasal)","Club (non-nasal)","Club (nasal)","pre-TB secretory","Goblet (bronchial)","Goblet (subsegmental)")
smg = c("SMG serous (bronchial)","SMG mucous","SMG duct","SMG serous (nasal)")
LUAD_query[LUAD_query$ann_finest_lev_transferred_label_filtered %in% AT1,"cat1"]="AT1"
LUAD_query[LUAD_query$ann_finest_lev_transferred_label_filtered %in% AT2,"cat1"]="AT2"
LUAD_query[LUAD_query$ann_finest_lev_transferred_label_filtered %in% basal,"cat1"]="Basal"
LUAD_query[LUAD_query$ann_finest_lev_transferred_label_filtered %in% multicilated,"cat1"]="multiciliated"
LUAD_query[LUAD_query$ann_finest_lev_transferred_label_filtered %in% secretory,"cat1"]="secretory"
LUAD_query[LUAD_query$ann_finest_lev_transferred_label_filtered %in% smg,"cat1"]="smg"
LUAD_query$cat1 = ifelse(LUAD_query$ann_finest_lev_transferred_label_filtered %in% rare,LUAD_query$cat,LUAD_query$cat1)

#Consolidate results into percentages per cell group per patient.
# Count the rows per group
# Calculate total counts per patient
 # Calculate percentage
dft2 <- LUAD_query %>% group_by(Patient, cat1) %>% summarise(count = n()) %>% mutate(total = sum(count), percentage = (count / total) * 100) %>% dplyr::select(-total)

#Set patient order and plot.
dft2$Patient <- factor(dft2$Patient, levels = c('LX681','UHL-7','WCM-3','UHL-3','LX666','WCM-2','LX701','LX679','WCM-4','UHL-5','UHL-8','LX661','LX699','LX676','UHL-1','UHL-4','UHL-6','WCM-1','PS05','PS06','PS09','PS03','PS04','PS10','UHL-2','LX680'))

p = ggplot(data=dft2, aes(x=Patient, y=percentage, fill=cat1)) + 
  geom_bar(stat="identity") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  scale_fill_manual(values=c("AT1"= '#1F77B4',"AT2"="#FF7F0E","Basal"="#279E68",
                             "multiciliated"="#D62728","Neuroendocrine"="#AA0","Tuft" = "salmon","Ionocyte"="grey",
                             "secretory"="#8C564B","smg"="#E377C2"))
print(p)

Figure 2

Figure 2B

library(skitools)
library(MASS)
library(parallel)
res = readRDS(file.path(params$fileLoc, "Fig2B.rds"))

#Set estimates value for each celltype below fdr threshold (0.1) to null value of 1. This effectively discards said celltypes from COO calling. 
dt = dcast.data.table(res[grep('cov', name), ][, estimate2 := ifelse(fdr<0.1, estimate, 1)], tt ~ celltype, value.var = 'estimate2')

#Take the estimates and add back GLM information required for plotting (confidence interval values, celltype, fdr, etc.).
dt.plot = melt(dt, id.vars = c("tt"))
colnames(dt.plot)[2] = "celltype"
dt.plot = as.data.table(dt.plot)
dt.plot[, combination := paste0(tt,"_",celltype)]
tmp.res = res[grep('cov', name), ][,.(combination,tt,celltype,estimate,ci.lower,ci.upper,p,fdr,plot.label)]
colnames(tmp.res)[1] = "combination.formal"
tmp.res[, combination := paste0(tt,"_",celltype)]
tmp.res$tt = NULL
tmp.res$celltype = NULL
dt.plot = merge(dt.plot, tmp.res, by = "combination", all.x = TRUE)
dt.plot$tumor.type = "LUAD"
dt.plot$tumor.type = toupper(dt.plot$tumor.type)
dt.plot$combo.label = paste0(dt.plot$tumor.type,"_",dt.plot$celltype)
dt.plot$Cell_Type = dt.plot$celltype


#Center relative risk and confidence interval estimate values at 1 for plot.
dt.plot$estimate = dt.plot$estimate - 1
dt.plot$ci.lower = dt.plot$ci.lower - 1
dt.plot$ci.upper = dt.plot$ci.upper - 1

# Relative risk barplots. The code allows for filtering for specific cancer type, snv columns, and celltypes via in.pattern.tt, in.pattern.snv, and in.ct.sub respectively. Here we add the values needed for plotting all GLM outputs defined above with no restriction. Note that multiple cancer subtype grpahs can be done by adding more values to in.pattern.tt. 
in.pattern.tt = c('LUAD')
in.pattern.snv = "snv.count"
in.ct.sub = epicells
for (ik in in.pattern.tt)
  {
    message('\nPlotting mutational density for: ', ik, ' samples')
#Filter for any cancer type, celltype, or snv column if needed. 
    in.pattern = in.pattern.snv
    in.count.sub = grep(paste0(in.pattern),dt.plot$tt, value = TRUE)
    dt.plot.sub = dt.plot[dt.plot$tumor.type %in% ik & dt.plot$celltype %in% in.ct.sub & dt.plot$tt %in% in.count.sub,]
#Order results by relative risk (lower to higher).     
    setorder(dt.plot.sub,celltype)
    in.title = unique(gsub(" - .*","",dt.plot.sub$combination.formal))
# Set upper and lower limits for plot (x axis).
    in.axis.breaks = 0.05
    in.axis.min = -0.25
    in.axis.max = 10.05
#Construct ggplot2 object and plot.    
    p = ggplot(dt.plot.sub, aes(x = reorder(celltype,estimate), y = estimate)) +
      geom_bar(stat = "identity", alpha = 0.9) +
      scale_fill_manual(values = c("grey")) +
      geom_errorbar(aes(ymax = ci.upper, ymin = ci.lower), size = 0.85, width = 0.25, color = 'black') +
      scale_y_continuous(breaks = seq(in.axis.min-0.05, in.axis.max+0.05, in.axis.breaks), labels = function(y) y + 1) +
      theme_bw() +
      theme(panel.grid.minor = element_blank()) +
      xlab("") +
      ylab("Relative Risk") +
      ggtitle(paste0(in.title)) +
      theme(axis.text.y = element_text(size = 10, angle = 0, vjust = 0.5, hjust = 1)) +
      coord_flip()
    print(p)
  }

Figure 2C

library(skitools)
library(MASS)
library(parallel)
res = readRDS(file.path(params$fileLoc, "Fig2C.rds"))

#Now set estimates value for each celltype below fdr threshold (0.1) to null value of 1. This effectively discards said celltypes from COO calling. 
dt = dcast.data.table(res[grep('cov', name), ][, estimate2 := ifelse(fdr<0.1, estimate, 1)], tt ~ celltype, value.var = 'estimate2')

#Take the estimates and add back GLM information required for plotting (confidence interval values, celltype, fdr, etc.).
dt.plot = melt(dt, id.vars = c("tt"))
colnames(dt.plot)[2] = "celltype"
dt.plot = as.data.table(dt.plot)
dt.plot[, combination := paste0(tt,"_",celltype)]
tmp.res = res[grep('cov', name), ][,.(combination,tt,celltype,estimate,ci.lower,ci.upper,p,fdr,plot.label)]
colnames(tmp.res)[1] = "combination.formal"
tmp.res[, combination := paste0(tt,"_",celltype)]
tmp.res$tt = NULL
tmp.res$celltype = NULL
dt.plot = merge(dt.plot, tmp.res, by = "combination", all.x = TRUE)
dt.plot$tumor.type = "LUSC"
dt.plot$tumor.type = toupper(dt.plot$tumor.type)
dt.plot$combo.label = paste0(dt.plot$tumor.type,"_",dt.plot$celltype)
dt.plot$Cell_Type = dt.plot$celltype

#Center relative risk and confidence interval estimate values at 1 for plot.
dt.plot$estimate = dt.plot$estimate - 1
dt.plot$ci.lower = dt.plot$ci.lower - 1
dt.plot$ci.upper = dt.plot$ci.upper - 1

# Relative risk barplots. The code allows for filtering for specific cancer type, snv columns, and celltypes via in.pattern.tt, in.pattern.snv, and in.ct.sub respectively. Here we add the values needed for plotting all GLM outputs defined above with no restriction. Note that multiple cancer subtype grpahs can be done by adding more values to in.pattern.tt. 
in.pattern.tt = c('LUSC')
in.pattern.snv = "snv.count"
for (ik in in.pattern.tt)
  {
    message('\nPlotting mutational density for: ', ik, ' samples')
#Filter for any cancer type, celltype, or snv column if needed.     
    in.ct.sub = epicells
    in.pattern = in.pattern.snv
    in.count.sub = grep(paste0(in.pattern),dt.plot$tt, value = TRUE)
    dt.plot.sub = dt.plot[dt.plot$tumor.type %in% ik & dt.plot$celltype %in% in.ct.sub & dt.plot$tt %in% in.count.sub,]
#Order results by relative risk (lower to higher).         
    setorder(dt.plot.sub,celltype)
    in.title = unique(gsub(" - .*","",dt.plot.sub$combination.formal))
# Set upper and lower limits for plot (x axis).
    in.axis.breaks = 0.05
    in.axis.min = -0.25
    in.axis.max = 10.05
#Construct ggplot2 object and plot.        
    p = ggplot(dt.plot.sub, aes(x = reorder(celltype,estimate), y = estimate)) +
      geom_bar(stat = "identity", alpha = 0.9) +
      scale_fill_manual(values = c("grey")) +
      geom_errorbar(aes(ymax = ci.upper, ymin = ci.lower), size = 0.85, width = 0.25, color = 'black') +
      #scale_y_continuous(breaks = seq(in.axis.min-0.05, in.axis.max+0.05, in.axis.breaks), labels = function(y) y + 1, limits = c(in.axis.min, in.axis.max)) +
      scale_y_continuous(breaks = seq(in.axis.min-0.05, in.axis.max+0.05, in.axis.breaks), labels = function(y) y + 1) +
      theme_bw() +
      theme(panel.grid.minor = element_blank()) +
      xlab("") +
      ylab("Relative Risk") +
      ggtitle(paste0(in.title)) +
      theme(axis.text.y = element_text(size = 10, angle = 0, vjust = 0.5, hjust = 1)) +
      coord_flip()
    print(p)
    }

Figure 3

Figure 3A

library(skitools)
library(data.table)
library(skitools)
library(skidb)
library(plyr)
library(dplyr)
library(MASS)
library(wesanderson)
library(readxl)
library(ggalluvial)
library(wesanderson)
library(readxl)
#library(ggsankey)
library(effects)
library(stats)
library(forcats) 
library(ggpubr)
library(ggforce)

mat.in.2 <- readRDS(file.path(params$fileLoc,'mat.in.2.rds'))
col_fun2 <- readRDS(file.path(params$fileLoc,'col_fun2.rds'))

column_ha3_1 = HeatmapAnnotation(
  Tobacco = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 1,29], ylim = c(0, 200000), gp = gpar(col = "red", fill = "#FF0000") , axis = FALSE),
  ID3 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 1,32], ylim = c(0, 10000), gp = gpar(col = "#9900CC", fill = "#9900CC") , axis = FALSE) ,
  
  SBS1 = anno_barplot( mat.in.2[mat.in.2$cluster_sk_non_apobec == 1,25], ylim = c(0, 3000), gp = gpar(col = "darkgreen", fill = "darkgreen") , axis = FALSE) ,
  SBS5 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 1,26], ylim = c(0, 60000), gp = gpar(col = "darkgreen", fill = "darkgreen") , axis = FALSE) ,
  
  Apobec = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 1,27], ylim = c(0, 40000), gp = gpar(col = "orange", fill = "#FF8000") , axis = FALSE) ,
  
  ID1 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 1,30], ylim = c(0, 4000), gp = gpar(col = "#9900CC", fill = "#9900CC") , axis = FALSE) ,
  ID2 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 1,31], ylim = c(0, 7000), gp = gpar(col = "#9900CC", fill = "#9900CC") , axis = FALSE) ,
  ID12 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 1,33], ylim = c(0, 3200), gp = gpar(col = "#9900CC", fill = "#9900CC") , axis = FALSE) ,
  
  purity = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 1,38], ylim = c(0, 1), gp = gpar(col = "green", fill = "green") , axis = FALSE),
  TMB =  anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 1,37], ylim = c(0, 100), gp = gpar(col = "brown", fill = "brown") , axis = FALSE),
  Smoking_Status = mat.in.2[mat.in.2$cluster_sk_non_apobec == 1,34],
  
  col = list(
    Smoking_Status = c("Smoker" = "orange", "Never Smoker" = "blue") #,
  ),
  show_annotation_name = FALSE
)


column_ha3_2 = HeatmapAnnotation(
  Tobacco = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 2,29], ylim = c(0, 200000), gp = gpar(col = "red", fill = "#FF0000") ),
  ID3 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 2,32], ylim = c(0, 10000), gp = gpar(col = "#9900CC", fill = "#9900CC") ) ,
  
  SBS1 = anno_barplot( mat.in.2[mat.in.2$cluster_sk_non_apobec == 2,25], ylim = c(0, 3000), gp = gpar(col = "darkgreen", fill = "darkgreen") ) ,
  SBS5 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 2,26], ylim = c(0, 60000), gp = gpar(col = "darkgreen", fill = "darkgreen") ) ,
  
  Apobec = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 2,27], ylim = c(0, 40000), gp = gpar(col = "orange", fill = "#FF8000") ) ,
  
  ID1 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 2,30], ylim = c(0, 4000), gp = gpar(col = "#9900CC", fill = "#9900CC") ) ,
  ID2 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 2,31], ylim = c(0, 7000), gp = gpar(col = "#9900CC", fill = "#9900CC") ) ,
  ID12 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 2,33], ylim = c(0, 3200), gp = gpar(col = "#9900CC", fill = "#9900CC") ) ,
  
  purity = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 2,38], ylim = c(0, 1), gp = gpar(col = "green", fill = "green") ),
  TMB =  anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 2,37], ylim = c(0, 100), gp = gpar(col = "brown", fill = "brown") ),
  Smoking_Status = mat.in.2[mat.in.2$cluster_sk_non_apobec == 2,34],
  
  col = list(
    Smoking_Status = c("Smoker" = "orange", "Never Smoker" = "blue") #,
  ),
  show_annotation_name = FALSE
)


column_ha3_3 = HeatmapAnnotation(
  Tobacco = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 3,29], ylim = c(0, 200000), gp = gpar(col = "red", fill = "#FF0000") , axis = FALSE),
  ID3 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 3,32], ylim = c(0, 10000), gp = gpar(col = "#9900CC", fill = "#9900CC") , axis = FALSE) ,
  
  SBS1 = anno_barplot( mat.in.2[mat.in.2$cluster_sk_non_apobec == 3,25], ylim = c(0, 3000), gp = gpar(col = "darkgreen", fill = "darkgreen") , axis = FALSE) ,
  SBS5 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 3,26], ylim = c(0, 60000), gp = gpar(col = "darkgreen", fill = "darkgreen") , axis = FALSE) ,
  
  Apobec = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 3,27], ylim = c(0, 40000), gp = gpar(col = "orange", fill = "#FF8000") , axis = FALSE) ,
  
  ID1 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 3,30], ylim = c(0, 4000), gp = gpar(col = "#9900CC", fill = "#9900CC") , axis = FALSE) ,
  ID2 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 3,31], ylim = c(0, 7000), gp = gpar(col = "#9900CC", fill = "#9900CC") , axis = FALSE) ,
  ID12 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 3,33], ylim = c(0, 3200), gp = gpar(col = "#9900CC", fill = "#9900CC") , axis = FALSE) ,
  
  purity = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 3,38], ylim = c(0, 1), gp = gpar(col = "green", fill = "green") , axis = FALSE),
  TMB =  anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 3,37], ylim = c(0, 100), gp = gpar(col = "brown", fill = "brown") , axis = FALSE),
  Smoking_Status = mat.in.2[mat.in.2$cluster_sk_non_apobec == 3,34],
  
  col = list(
    Smoking_Status = c("Smoker" = "orange", "Never Smoker" = "blue") #,
  ),
  show_annotation_name = FALSE
)

column_ha3_4 = HeatmapAnnotation(
  #snv = anno_barplot(mat.in.2[,26], gp = gpar(col = "black", fill = "#4575B4")),
  Tobacco = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 4,29], ylim = c(0, 200000), gp = gpar(col = "red", fill = "#FF0000") , axis = FALSE),
  ID3 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 4,32], ylim = c(0, 10000), gp = gpar(col = "#9900CC", fill = "#9900CC") , axis = FALSE) ,
  
  SBS1 = anno_barplot( mat.in.2[mat.in.2$cluster_sk_non_apobec == 4,25], ylim = c(0, 3000), gp = gpar(col = "darkgreen", fill = "darkgreen") , axis = FALSE) ,
  SBS5 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 4,26], ylim = c(0, 60000), gp = gpar(col = "darkgreen", fill = "darkgreen") , axis = FALSE) ,
  
  Apobec = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 4,27], ylim = c(0, 40000), gp = gpar(col = "orange", fill = "#FF8000") , axis = FALSE) ,
  
  ID1 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 4,30], ylim = c(0, 4000), gp = gpar(col = "#9900CC", fill = "#9900CC") , axis = FALSE) ,
  ID2 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 4,31], ylim = c(0, 7000), gp = gpar(col = "#9900CC", fill = "#9900CC") , axis = FALSE) ,
  ID12 = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 4,33], ylim = c(0, 3200), gp = gpar(col = "#9900CC", fill = "#9900CC") , axis = FALSE) ,
  
  purity = anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 4,38], ylim = c(0, 1), gp = gpar(col = "green", fill = "green") , axis = FALSE),
  TMB =  anno_barplot(mat.in.2[mat.in.2$cluster_sk_non_apobec == 4,37], ylim = c(0, 100), gp = gpar(col = "brown", fill = "brown") , axis = FALSE),
  Smoking_Status = mat.in.2[mat.in.2$cluster_sk_non_apobec == 4,34],
  
  col = list(
    Smoking_Status = c("Smoker" = "orange", "Never Smoker" = "blue") #,
  ),
  show_annotation_name = TRUE
)


set.seed(90210)
ht1 <- Heatmap(t(mat.in.2[mat.in.2$cluster_sk_non_apobec == 1,2:24]), name = "HT1 Relative Risk", col = col_fun2, cluster_rows = TRUE, cluster_columns = TRUE, row_names_gp = gpar(fontsize = 15), column_names_gp = gpar(fontsize = 10), 
               column_names_side = c("bottom"), show_column_names = FALSE, column_km = 1, column_km_repeats = 100, top_annotation = column_ha3_1,
               show_parent_dend_line = FALSE, column_gap = unit(c(4), "mm"), column_title = NULL, border = TRUE)

set.seed(90210)
ht2 <- Heatmap(t(mat.in.2[mat.in.2$cluster_sk_non_apobec == 2,2:24]), name = "HT2 Relative Risk", col = col_fun2, cluster_rows = TRUE, cluster_columns = TRUE, row_names_gp = gpar(fontsize = 15), column_names_gp = gpar(fontsize = 10), 
               column_names_side = c("bottom"), show_column_names = FALSE, column_km = 1, column_km_repeats = 100, top_annotation = column_ha3_2,
               show_parent_dend_line = FALSE, column_gap = unit(c(4), "mm"), column_title = NULL, border = TRUE)

set.seed(90210)
ht3 <- Heatmap(t(mat.in.2[mat.in.2$cluster_sk_non_apobec == 3,2:24]), name = "HT3 Relative Risk", col = col_fun2, cluster_rows = TRUE, cluster_columns = TRUE, row_names_gp = gpar(fontsize = 15), column_names_gp = gpar(fontsize = 10), 
               column_names_side = c("bottom"), show_column_names = FALSE, column_km = 1, column_km_repeats = 100, top_annotation = column_ha3_3,
               show_parent_dend_line = FALSE, column_gap = unit(c(4), "mm"), column_title = NULL, border = TRUE)

set.seed(90210)
ht4 <- Heatmap(t(mat.in.2[mat.in.2$cluster_sk_non_apobec == 4,2:24]), name = "HT4 Relative Risk", col = col_fun2, cluster_rows = TRUE, cluster_columns = TRUE, row_names_gp = gpar(fontsize = 15), column_names_gp = gpar(fontsize = 10), 
               column_names_side = c("bottom"), show_column_names = FALSE, column_km = 1, column_km_repeats = 100, top_annotation = column_ha3_4,
               show_parent_dend_line = FALSE, column_gap = unit(c(4), "mm"), column_title = NULL, border = TRUE)


ht_list =  ht2+ ht1 + ht3 + ht4 
draw(ht_list, row_title = "Heatmap list", column_title = "Heatmap list")

Figure 3B

# Distal Lung
fig3B_data <- readRDS(file.path(params$fileLoc,'fig3B_data.rds'))
ggplot(fig3B_data[cluster == 'Distal'], aes(x = reorder(celltype,estimate), y = estimate, fill = Cell_Class)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  scale_fill_manual(values = c("grey")) +
  geom_errorbar(aes(ymax = ci.upper, ymin = ci.lower), size = 0.65, width = 0.25, color = 'black') +
  scale_y_continuous(breaks = seq(in.axis.min-0.05, in.axis.max+0.05, in.axis.breaks), labels = function(y) y + 1, limits = c(-0.25, 0.35)) + # LUAD
  scale_fill_manual("legend", values = c("Proximal" = "olivedrab3", "Distal" = "darkgoldenrod3")) +
  theme_bw() +
  theme(panel.grid.minor = element_blank()) +
  xlab("") +
  ylab("Relative Risk") +
  ggtitle(paste0(in.title, 'LUAD - Cluster 1')) +
  theme(axis.text.y = element_text(size = 10, angle = 0, vjust = 0.5, hjust = 1)) + guides(fill=guide_legend(title="Cell types")) +
  coord_flip()

# Ambiguous
ggplot(fig3B_data[cluster == 'Ambiguous'], aes(x = reorder(celltype,estimate), y = estimate, fill = Cell_Class)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  scale_fill_manual(values = c("grey")) +
  geom_errorbar(aes(ymax = ci.upper, ymin = ci.lower), size = 0.65, width = 0.25, color = 'black') +
  scale_y_continuous(breaks = seq(in.axis.min-0.05, in.axis.max+0.05, in.axis.breaks), labels = function(y) y + 1, limits = c(-0.25, 0.35)) + # LUAD
  scale_fill_manual("legend", values = c("Proximal" = "olivedrab3", "Distal" = "darkgoldenrod3")) +
  theme_bw() +
  theme(panel.grid.minor = element_blank()) +
  xlab("") +
  ylab("Relative Risk") +
  ggtitle(paste0(in.title, 'LUAD - Cluster 2')) +
  theme(axis.text.y = element_text(size = 10, angle = 0, vjust = 0.5, hjust = 1)) + guides(fill=guide_legend(title="Cell types")) +
  coord_flip()

# Proximal_1
ggplot(fig3B_data[cluster == 'Proximal_1'], aes(x = reorder(celltype,estimate), y = estimate, fill = Cell_Class)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  scale_fill_manual(values = c("grey")) +
  geom_errorbar(aes(ymax = ci.upper, ymin = ci.lower), size = 0.65, width = 0.25, color = 'black') +
  scale_y_continuous(breaks = seq(in.axis.min-0.05, in.axis.max+0.05, in.axis.breaks), labels = function(y) y + 1, limits = c(-0.25, 0.35)) + # LUAD
  scale_fill_manual("legend", values = c("Proximal" = "olivedrab3", "Distal" = "darkgoldenrod3")) +
  theme_bw() +
  theme(panel.grid.minor = element_blank()) +
  xlab("") +
  ylab("Relative Risk") +
  ggtitle(paste0(in.title, 'LUAD - Cluster 3')) +
  theme(axis.text.y = element_text(size = 10, angle = 0, vjust = 0.5, hjust = 1)) + guides(fill=guide_legend(title="Cell types")) +
  coord_flip()

# Proximal_2
ggplot(fig3B_data[cluster == 'Proximal_2'], aes(x = reorder(celltype,estimate), y = estimate, fill = Cell_Class)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  scale_fill_manual(values = c("grey")) +
  geom_errorbar(aes(ymax = ci.upper, ymin = ci.lower), size = 0.65, width = 0.25, color = 'black') +
  scale_y_continuous(breaks = seq(in.axis.min-0.05, in.axis.max+0.05, in.axis.breaks), labels = function(y) y + 1, limits = c(-0.25, 0.35)) + # LUAD
  scale_fill_manual("legend", values = c("Proximal" = "olivedrab3", "Distal" = "darkgoldenrod3")) +
  theme_bw() +
  theme(panel.grid.minor = element_blank()) +
  xlab("") +
  ylab("Relative Risk") +
  ggtitle(paste0(in.title, 'LUAD - Cluster 4')) +
  theme(axis.text.y = element_text(size = 10, angle = 0, vjust = 0.5, hjust = 1)) + guides(fill=guide_legend(title="Cell types")) +
  coord_flip()

Figure 3C

# tmb
extd_data_3 <- fread(file.path(params$fileLoc,'extd_data_3.csv'))
ik = 'tmb_75_pct'
freq.plot.tmp.f = extd_data_3 
freq.plot.tmp.f$clust.int = freq.plot.tmp.f$CellOfOrigin
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('Proximal','Distal', 'Ambiguous'))

ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'olivedrab3', 'darkgoldenrod3', 'darkgrey')) +
  labs(title = paste0(ik), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = '')) + theme(legend.position = "top")

# Tobacco  
ik = 'Tobacco_75pct'
freq.plot.tmp.f = extd_data_3 
freq.plot.tmp.f$clust.int = freq.plot.tmp.f$CellOfOrigin
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('Proximal','Distal', 'Ambiguous'))
prox.mut = res.plot[clust.int == 'Proximal',]$nmut
prox.wt = res.plot[clust.int == 'Proximal',]$tot - res.plot[clust.int == 'Proximal',]$nmut
dist.mut = res.plot[clust.int == 'Distal',]$nmut
dist.wt = res.plot[clust.int == 'Distal',]$tot - res.plot[clust.int == 'Distal',]$nmut
prox.dist.fisher = matrix(c(prox.mut, dist.mut, prox.wt, dist.wt), nrow = 2, byrow = TRUE) %>% fisher.test %>% dflm

ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'olivedrab3', 'darkgoldenrod3', 'darkgrey')) +
  # labs(title = paste0(ik, ' - p = ', round(prox.dist.fisher$p,6), ', e = ', round(prox.dist.fisher$estimate,3)), x = '', y = '') + theme_classic() +
  labs(title = paste0(ik), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = '')) + theme(legend.position = "top")

# ID3
ik = 'ID3_75pct'
freq.plot.tmp.f = extd_data_3 
freq.plot.tmp.f$clust.int = freq.plot.tmp.f$CellOfOrigin
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('Proximal','Distal', 'Ambiguous'))

ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'olivedrab3', 'darkgoldenrod3', 'darkgrey')) +
  labs(title = paste0(ik), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = '')) + theme(legend.position = "top")

# Apobec
ik = 'Apobec_75pct'
freq.plot.tmp.f = extd_data_3 
freq.plot.tmp.f$clust.int = freq.plot.tmp.f$CellOfOrigin
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('Proximal','Distal', 'Ambiguous'))

ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'olivedrab3', 'darkgoldenrod3', 'darkgrey')) +
  labs(title = paste0(ik), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = '')) + theme(legend.position = "top")

# SBS1
ik = 'SBS1_75pct'
freq.plot.tmp.f = extd_data_3 
freq.plot.tmp.f$clust.int = freq.plot.tmp.f$CellOfOrigin
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('Proximal','Distal', 'Ambiguous'))

ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'olivedrab3', 'darkgoldenrod3', 'darkgrey')) +
  labs(title = paste0(ik), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = '')) + theme(legend.position = "top")

# SBS5
ik = 'SBS5_75pct'
freq.plot.tmp.f = extd_data_3 
freq.plot.tmp.f$clust.int = freq.plot.tmp.f$CellOfOrigin
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('Proximal','Distal', 'Ambiguous'))

ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'olivedrab3', 'darkgoldenrod3', 'darkgrey')) +
  labs(title = paste0(ik), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = '')) + theme(legend.position = "top")

# ID1
ik = 'ID1_75pct'
freq.plot.tmp.f = extd_data_3 
freq.plot.tmp.f$clust.int = freq.plot.tmp.f$CellOfOrigin
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('Proximal','Distal', 'Ambiguous'))

ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'olivedrab3', 'darkgoldenrod3', 'darkgrey')) +
  labs(title = paste0(ik), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = '')) + theme(legend.position = "top")

# ID12
ik = 'ID12_75pct'
freq.plot.tmp.f = extd_data_3 
freq.plot.tmp.f$clust.int = freq.plot.tmp.f$CellOfOrigin
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('Proximal','Distal', 'Ambiguous'))

ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'olivedrab3', 'darkgoldenrod3', 'darkgrey')) +
  labs(title = paste0(ik), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = '')) + theme(legend.position = "top")

Figure 3D

extd_data_3 <- fread(file.path(params$fileLoc,'extd_data_3.csv'))
ik = 'Never_Smoker'
freq.plot.tmp.f = extd_data_3[!is.na(CellOfOrigin)]
freq.plot.tmp.f$clust.int = freq.plot.tmp.f$CellOfOrigin
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('Proximal','Distal', 'Ambiguous'))

ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'olivedrab3', 'darkgoldenrod3', 'darkgrey')) +
  labs(title = paste0(ik), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = '')) + theme(legend.position = "top")

Figure 3E

# KRAS
extd_data_3 <- fread(file.path(params$fileLoc,'extd_data_3.csv'))
ik = 'KRAS'
freq.plot.tmp.f = extd_data_3 
freq.plot.tmp.f$clust.int = freq.plot.tmp.f$CellOfOrigin
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('Proximal','Distal', 'Ambiguous'))

ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'olivedrab3', 'darkgoldenrod3', 'darkgrey')) +
  labs(title = paste0(ik), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = '')) + theme(legend.position = "top")

# TP53
ik = 'TP53'
freq.plot.tmp.f = extd_data_3 
freq.plot.tmp.f$clust.int = freq.plot.tmp.f$CellOfOrigin
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('Proximal','Distal', 'Ambiguous'))

ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'olivedrab3', 'darkgoldenrod3', 'darkgrey')) +
  labs(title = paste0(ik), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = '')) + theme(legend.position = "top")

# STK11
ik = 'STK11'
freq.plot.tmp.f = extd_data_3 
freq.plot.tmp.f$clust.int = freq.plot.tmp.f$CellOfOrigin
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('Proximal','Distal', 'Ambiguous'))

ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'olivedrab3', 'darkgoldenrod3', 'darkgrey')) +
  labs(title = paste0(ik), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = '')) + theme(legend.position = "top")

# EGFR
ik = 'EGFR'
freq.plot.tmp.f = extd_data_3 
freq.plot.tmp.f$clust.int = freq.plot.tmp.f$CellOfOrigin
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('Proximal','Distal', 'Ambiguous'))

ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'olivedrab3', 'darkgoldenrod3', 'darkgrey')) +
  labs(title = paste0(ik), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = '')) + theme(legend.position = "top")

# SMARCA4
ik = 'SMARCA4'
freq.plot.tmp.f = extd_data_3 
freq.plot.tmp.f$clust.int = freq.plot.tmp.f$CellOfOrigin
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('Proximal','Distal', 'Ambiguous'))

ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'olivedrab3', 'darkgoldenrod3', 'darkgrey')) +
  labs(title = paste0(ik), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = '')) + theme(legend.position = "top")

Figure 4

Figure 4A

library(skitools)
library(ggalluvial)
library(wesanderson)

pts_coo_id <- readRDS(file.path(params$fileLoc,'pts_coo_id.rds'))
pts_coo_id$Origin = factor(pts_coo_id$Origin, levels = c(  "Distal Lung_TP53 MUT", "Ambiguous_TP53 MUT", "Proximal Lung_TP53 MUT",  "Distal Lung_WT", "Ambiguous_WT", "Proximal Lung_WT"))
pts_coo_id$Identity = factor(pts_coo_id$Identity, levels = c("Distal Lung", "Non-Distal Lung"))

ggplot(pts_coo_id,
       aes(y = Patient_Frequency,
           axis1 = Origin,
           axis2 = Identity)) +
  theme_bw() +
  geom_alluvium(aes(fill = Origin), width = 1/12) +
  geom_stratum(width = 1/12, discern = TRUE, fill = "grey90", color = "black") +
  geom_label(stat = "stratum", size = 3.5, aes(label = after_stat(stratum))) +
  scale_x_discrete(limits = c("Origin", "Identity"), expand = c(0.05, 0.05)) +
  scale_fill_manual(values = c(wes_palettes$GrandBudapest2[4], wes_palettes$Chevalier1[1], wes_palettes$GrandBudapest2[3], wes_palettes$Chevalier1[2], wes_palettes$GrandBudapest2[2], wes_palettes$Chevalier1[3], wes_palettes$AsteroidCity3[1], wes_palettes$AsteroidCity1[1])) +
  ggtitle("Fig 4A - Origin --> Identity") + coord_flip()

Figure 4B

pts_coo_id_4B <- readRDS(file.path(params$fileLoc,'pts_coo_id_4B.rds'))

res.plot =  pts_coo_id_4B[, prop.test(sum(Identity == 'Distal Lung'), .N) %>% dflm %>% cbind(nprox = sum(Identity == 'Distal Lung'), tot = .N), by = .(TP53_mut = ifelse(TP53_mut, 'TP53 MUT', 'WT'), Origin)][, fracprox := estimate]
res.plot$Origin = factor(res.plot$Origin, levels = c('Non-Distal Lung','Distal Lung'))
res.plot$TP53_mut = factor(res.plot$TP53_mut, levels = c('TP53 MUT','WT'))
ggplot(res.plot, aes(x = TP53_mut, y = fracprox, fill = TP53_mut)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  facet_grid(~Origin) +
  scale_fill_manual(values = c(wes_palettes$Royal1[2], wes_palettes$Royal1[1])) +
  labs(title = '', x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 0, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = TP53_mut, y = ci.upper + 0.05, label = paste0(nprox, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = 'Fig 4B - Distal fraction')) + theme(legend.position = "bottom")

Figure 5

Figure 5B - Left Panel

#Load libraries
library(skitools)
library(parallel)
library(MASS)

#Load csv with relative risk information per centroid for every sample.
rel_risk = read.table(file.path(params$fileLoc,"Fig5B_luad_rel_risk.csv"),sep=",",header=T,row.names=1)

#Group samples based on Distal, Proximal, or Ambiguous COO calls. Compute mean relative risk for each.
#Distal
distal = rel_risk[which(rel_risk$infer_coo_non_apobec_sigp=="Distal Lung"),]
distal_ep = distal[,1:23]
distal_ep = t(distal_ep)
distal_ep_mean = as.matrix(rowMeans(distal_ep))
distal_ep_mean = as.data.frame(distal_ep_mean)
distal_ep_mean$celltype = rownames(distal_ep_mean)
#Proximal
proximal = rel_risk[which(rel_risk$infer_coo_non_apobec_sigp=="Proximal Lung"),]
proximal_ep = proximal[,1:23]
proximal_ep = t(proximal_ep)
proximal_ep_mean = as.matrix(rowMeans(proximal_ep))
proximal_ep_mean = as.data.frame(proximal_ep_mean)
proximal_ep_mean$celltype = rownames(proximal_ep_mean)
#Ambiguous
ambi = rel_risk[which(rel_risk$infer_coo_non_apobec_sigp=="Ambiguous"),]
ambi_ep = ambi[,1:23]
ambi_ep = t(ambi_ep)
ambi_ep_mean = as.matrix(rowMeans(ambi_ep))
ambi_ep_mean = as.data.frame(ambi_ep_mean)
ambi_ep_mean$celltype = rownames(ambi_ep_mean)

#Load WCM-1 GLM output. Get vector with relative risks per centroid.
wcm = readRDS(file.path(params$fileLoc,"Fig5B_cov_res.rds"))
wcm_mat = wcm[,c("celltype","estimate")]
wcm_mat = as.data.frame(wcm_mat)
rownames(wcm_mat) = wcm_mat$celltype
wcm_mat1 = as.data.frame(wcm_mat[,2])
rownames(wcm_mat1) = rownames(wcm_mat)
colnames(wcm_mat1) = "estimate"

#Compute euclidean distances between average relative risk vectors of distal, proximal, or ambiguous samples.
CalculateEuclideanDistance <- function(vect1, vect2) sqrt(sum((vect1 - vect2)^2))
distal_dist = CalculateEuclideanDistance(distal_ep_mean$V1, wcm_mat1$estimate)
prox_dist = CalculateEuclideanDistance(proximal_ep_mean$V1, wcm_mat1$estimate)
ambi_dist = CalculateEuclideanDistance(ambi_ep_mean$V1, wcm_mat1$estimate)
distances = c(distal_dist = CalculateEuclideanDistance(distal_ep_mean$V1, wcm_mat1$estimate),
              prox_dist = CalculateEuclideanDistance(proximal_ep_mean$V1, wcm_mat1$estimate),
              ambi_dist = CalculateEuclideanDistance(ambi_ep_mean$V1, wcm_mat1$estimate))

#Results are saved in the distances data.frame object (figure was manually made based on this object).
distances = as.data.frame(distances)
distances$Vector = rownames(distances)
DT::datatable(distances, options = list(scrollX = TRUE), caption = "Distances Table")

Figure 5B - Right Panel

#Load libraries
library(skitools)

#Load GLM output for WCM-1.
dt.plot = readRDS(file.path(params$fileLoc,"Fig5B_cov_res.rds"))
dt.plot$Cell_Type = dt.plot$celltype
dt.plot$tumor.type = "LUAD"

#Center relative risk and confidence interval estimate values at 1 for plot.
dt.plot$estimate = dt.plot$estimate - 1
dt.plot$ci.lower = dt.plot$ci.lower - 1
dt.plot$ci.upper = dt.plot$ci.upper - 1

# Relative risk barplots. The code allows for filtering for specific cancer type, snv columns, and celltypes via in.pattern.tt, in.pattern.snv, and in.ct.sub respectively. Here we add the values needed for plotting all GLM outputs defined above with no restriction. Note that multiple cancer subtype grpahs can be done by adding more values to in.pattern.tt. 
in.pattern.tt = c('LUAD')
in.pattern.snv = "snv.count"

for (ik in in.pattern.tt)
  {
    message('\nPlotting mutational density for: ', ik, ' samples')
#Filter for any cancer type, celltype, or snv column if needed.     
    in.ct.sub = dt.plot$celltype
    in.pattern = in.pattern.snv
    in.count.sub = grep(paste0(in.pattern),dt.plot$tt, value = TRUE)
    dt.plot.sub = dt.plot[tumor.type %in% ik & celltype %in% in.ct.sub & tt %in% in.count.sub,]
#Order results by relative risk (lower to higher).         
    setorder(dt.plot.sub,celltype)
    in.title = unique(gsub(" - .*","",dt.plot.sub$combination.formal))
# Set upper and lower limits for plot (x axis).
    in.axis.breaks = 0.05
    in.axis.min = -0.8
    in.axis.max = 10.05
#Construct ggplot2 object and plot.        
    p = ggplot(dt.plot.sub, aes(x = reorder(celltype,estimate), y = estimate)) +
      geom_bar(stat = "identity", alpha = 0.9) +
      scale_fill_manual(values = c("grey")) +
      geom_errorbar(aes(ymax = ci.upper, ymin = ci.lower), size = 0.85, width = 0.25, color = 'black') +
      scale_y_continuous(breaks = seq(in.axis.min-0.05, in.axis.max+0.05, in.axis.breaks), labels = function(y) y + 1) +
      theme_bw() +
      theme(panel.grid.minor = element_blank()) +
      xlab("") +
      ylab("Relative Risk") +
      ggtitle(paste0(in.title)) +
      theme(axis.text.y = element_text(size = 10, angle = 0, vjust = 0.5, hjust = 1)) +
      coord_flip()
    print(p)
}

Figure 5C

#Load libraries. 
library(skitools)
#Load file with umap of cancer cells output. This allows for gathering WCM-1A and B cells. 
tmp.plot = readRDS(file.path(params$fileLoc,"Fig5_tmp_plot_for_umap_fig5_knn6.rds"))

#Filter for WCM-1 cells, split in WCM-1 A and B based on their cluster numbers.
tmp_plot_wcm = tmp.plot[which(tmp.plot$patient=="WCM-1"),]
wcm_clusters = tmp_plot_wcm[which(tmp_plot_wcm$cluster!=1),]
wcm_clusters[which(wcm_clusters$cluster==2),"type"]="WCM-1-A"
wcm_clusters[which(wcm_clusters$cluster==3),"type"]="WCM-1-B"
wcm_clusters = as.data.frame(wcm_clusters)
wcm1a = wcm_clusters[which(wcm_clusters$type=="WCM-1-A"),]
wcm1b = wcm_clusters[which(wcm_clusters$type=="WCM-1-B"),]

#Load aneuploidy score output from CONICSmat.
tmp_norm = readRDS(file.path(params$fileLoc,"Fig5C_anneuploidy_zscore.rds"))
tmp_norm = as.data.frame(tmp_norm)

#Get aneuploidy score for WCM-1 cells.
tmp_norm_wcm = tmp_norm[match(wcm_clusters$cell,rownames(tmp_norm)),]
tmp_norm_wcm = as.matrix(tmp_norm_wcm)
tmp_norm_wcm1a = tmp_norm_wcm[match(wcm1a$cell,rownames(tmp_norm_wcm)),]
tmp_norm_wcm1b = tmp_norm_wcm[match(wcm1b$cell,rownames(tmp_norm_wcm)),]
tmp_norm_wcm = rbind(tmp_norm_wcm1a,tmp_norm_wcm1b)
tmp_norm_wcm = as.matrix(tmp_norm_wcm)

#Generate heatmap annotation and then plot heatmap.
row.ha = HeatmapAnnotation(Patient = wcm_clusters$type, name = 'Tissue', width = unit(2, "cm"), show_legend = TRUE, show_annotation_name = FALSE, col = list(Patient = c("WCM-1-A"= "#00BA38", "WCM-1-B" = "#619CFF")), annotation_legend_param = list("WCM-1-A", "WCM-1-B"), which = "row")
p = Heatmap(tmp_norm_wcm, name = 'z-score', cluster_rows = FALSE, cluster_columns = FALSE, row_names_gp = gpar(fontsize = 9), column_names_gp = gpar(fontsize = 8), column_names_side = c("bottom"), show_column_names = TRUE, show_row_names = FALSE, column_title_gp = gpar(fontsize = 20, fontface = "bold"))
print(p)

Figure 5D

#Load libraries 
library(skitools)

#Load per cell identity labels. Remove cells with no identity.
labels = read.table(file.path(params$fileLoc,"Fig5D_combined_emb_for_carc_for_homogenous_sikk_ep_500_nw.csv"),sep=",",header=TRUE)
labels$ann_finest_lev_transferred_label_filtered = ifelse(labels$ann_finest_lev_transfer_uncert<0.5,labels$ann_finest_lev_transferred_label_unfiltered,"Unknown")

#Define proximal and distal cell groupings.
distal_sikk = c("AT1","AT2","AT2 proliferating","AT0")
rare_sikk = c("Neuroendocrine","Tuft","Ionocyte")
cells = c(distal_sikk,rare_sikk)
proximal_sikk = setdiff(labels$ann_finest_lev_transferred_label_filtered,cells)

#Get WCM-1 A and B cells from the carcinoma cell UMAP  (same as for Fig 5C).
tmp.plot = readRDS(file.path(params$fileLoc,"Fig5_tmp_plot_for_umap_fig5_knn6.rds"))
tmp_plot_wcm = tmp.plot[which(tmp.plot$patient=="WCM-1"),]
wcm_clusters = tmp_plot_wcm[which(tmp_plot_wcm$cluster!=1),]
wcm_clusters[which(wcm_clusters$cluster==2),"type"]="WCM-1-A"
wcm_clusters[which(wcm_clusters$cluster==3),"type"]="WCM-1-B"
wcm_clusters = as.data.frame(wcm_clusters)
wcm1a = wcm_clusters[which(wcm_clusters$type=="WCM-1-A"),]
wcm1b = wcm_clusters[which(wcm_clusters$type=="WCM-1-B"),]

#For WCM-1 A and B, get the cell labels and assign Distal/Non-Distal to the corresponding celltypes.
wcm1a_labels = labels[match(wcm1a$cell,labels$CellBarcode),]
wcm1a_labels = wcm1a_labels[which(wcm1a_labels$ann_finest_lev_transferred_label_filtered!="Unknown"),]
wcm1a_DND = as.data.frame(table(wcm1a_labels$ann_finest_lev_transferred_label_filtered))
wcm1a_DND$pat = "WCM-1-A"
wcm1a_DND$cat = "Non-Distal"
wcm1a_DND[which(wcm1a_DND$Var1 %in% distal_sikk),"cat"] = "Distal"

wcm1b_labels = labels[match(wcm1b$cell,labels$CellBarcode),]
wcm1b_labels = wcm1b_labels[which(wcm1b_labels$ann_finest_lev_transferred_label_filtered!="Unknown"),]
wcm1b_DND = as.data.frame(table(wcm1b_labels$ann_finest_lev_transferred_label_filtered))
wcm1b_DND$pat = "WCM-1-B"
wcm1b_DND$cat = "Non-Distal"
wcm1b_DND[which(wcm1b_DND$Var1 %in% distal_sikk),"cat"] = "Distal"

#Combine and compute percentage per celltype per subpopulation..
wcm1_DND = rbind(wcm1a_DND,wcm1b_DND)
result <- wcm1_DND %>%
  group_by(pat) %>%
  mutate(percentage = Freq / sum(Freq) * 100)

#Define color palette and plot.
library(RColorBrewer)
palette1 = brewer.pal(12, "Set3")
palette2 = brewer.pal(5, "Set2")
combined_palette = c(palette1, palette2)

p = ggplot(result, aes(x = factor(cat), y = percentage, fill = Var1)) +
  geom_bar(stat = "identity") +                # Create barplot
  theme_bw() +                                 # Use a clean theme
#  theme(legend.position = "none") +            # Remove legend
  facet_wrap(~ pat) +         # Create separate barplots for each 'pat'
  labs(x = "Frequency", y = "Percentage",      # Label axes
       title = "Barplot of Percentage by Frequency and Pat") + 
  scale_fill_manual(values = combined_palette)+ylim(0,100)

print(p)

Figure 5E

#Load libraries
library(skitools)
library(EnhancedVolcano)

#Load file with marker genes.
seu.markers = readRDS(file.path(params$fileLoc,"Fig5E_seu_markers.rds"))

#Filter for markers from WCM-1 A and B (based on their cluster number in the carcinoma cell umap).
markers.1 = seu.markers[seu.markers$cluster == 2,]
markers.1$wcm = 'WCM-1-A'
markers.1$avg_log2FC = markers.1$avg_log2FC * -1
markers.2 = seu.markers[seu.markers$cluster == 3,]
markers.2$wcm = 'WCM-1-B'

#Combine the markers and plot.
seu.markers.mast.plot = rbind(markers.1, markers.2)
p = EnhancedVolcano(seu.markers.mast.plot,
                    lab = seu.markers.mast.plot$gene,
                    x = 'avg_log2FC', y = 'p_val',
                    xlim = c(-3, 3),
                    title = 'WCM-1-A (Alveolar-like) v. WCM-1-B (Basal-like)',
                    subtitle = "",
                    pCutoff = 10e-10,
                    FCcutoff = 0.5,
                    labSize = 3, colAlpha = 1,
                    pointSize = 2)
print(p)

Extended Data Figure 1

Extended Data Figure 1A

#Load the required libraries.
library(skitools)
library(data.table)
library(skidb)
library(plyr)
library(dplyr)
library(MASS)
library(wesanderson)
library(readxl)
library(ggalluvial)
library(readxl)
library(effects)
library(stats)
library(forcats) 
library(ggpubr)
library(ggforce)
library(ComplexHeatmap)
library(Seurat)

#Load the expression matrix and the cluster data.
expDat = readRDS(file.path(params$fileLoc,"sup1AExp.rds"))
clustDat = readRDS(file.path(params$fileLoc,"sup1AClusters.rds"))

#Construct the seurat object.
seu = CreateSeuratObject(counts = expDat)
seu@meta.data$seurat_clusters = clustDat

#Generate the Dotplot.
p = DotPlot(seu, features = rownames(expDat), group.by="seurat_clusters") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p)

Extended Data Figure 1B & 1C

#Load required libraries.
library(skitools)
library(Seurat)
library(ggpubr)

#Load seurat metadata (with clustering results and cell names), load per cell aneploidy scores, and load shannon entropy scores per cluster.
tmp.plot = data.table(readRDS(file.path(params$fileLoc,"seuMetaData.rds")))
tmp.l.norm = readRDS(file.path(params$fileLoc,"anneuploidy zscore.rds"))
exp_shannon = readRDS(file.path(params$fileLoc,"clusterEntropy.rds"))

#Create table of total aneuploidy score per cell. Add to the metadata table.
aneu.score = data.table(cell = rownames(tmp.l.norm), Aneuplody.Score = apply(tmp.l.norm,1,function(x) sum(abs(x))))
tmp.plot = merge(tmp.plot,aneu.score,by = 'cell')

#Compute aneuploidy score per cluster. 
tmp.df = aggregate(Aneuplody.Score ~ seurat_clusters, data = tmp.plot, FUN = mean)
colnames(tmp.df)[2] = 'Aneuploidy.Score.mean'
#Add diversity (entropy) value per cluster. 
tmp.df$S.Diversity.num = exp_shannon


#Label carcinoma cluster based on high aneuploidy and low entropy.
tmp.df$cat = ifelse(((tmp.df$S.Diversity.num < 10) &(tmp.df$Aneuploidy.Score.mean > 40)),"Carcinoma","Others")
tmp.nw = tmp.plot[,.(Tissue,seurat_clusters)]
tmp.nw = as.data.frame(tmp.nw)
tmp.dt = tmp.df
tmp.df = as.data.frame(tmp.df)
tmp.df = merge(tmp.df,tmp.nw,by='seurat_clusters')
tmp.df.nw = tmp.df %>%  distinct(seurat_clusters, .keep_all = TRUE)
tmp.df.nw = as.data.table(tmp.df.nw)
#Add annotations for non-carcinoma clusters based on their associated tissue (tumor associated epithlial or normal).
tmp.df.nw[which((tmp.df.nw$Tissue == "Tumor" | tmp.df.nw$Tissue == "Metastasis") & (tmp.df.nw$cat == "Others")),"cat"] = "Tumor Associated Epithelial"
tmp.df.nw[which(tmp.df.nw$Tissue== "Adjacent Lung") & (tmp.df.nw$cat == "Others"),"cat"] = "Normal"
tmp.df.nw[13,'cat']="Carcinoma"

#Generate scatterplot for Sup 1B.
p = ggscatter(tmp.df.nw, x = "S.Diversity.num", y = "Aneuploidy.Score.mean",
              color = 'cat',
              repel = TRUE, xlab = "Shannon's Diversity", ylab = 'Aneuploidy Score (mean)')
print(p)

#Link carcinoma and non-carcinoma annotations to each cell.
tmp.plot$cat = ""
for(i in 1:nrow(tmp.df.nw)){
    tmp.plot[tmp.plot$seurat_clusters==tmp.df.nw$seurat_clusters[i],]$cat = tmp.df.nw$cat[i]  
    }
#Organize cells by annotation.
tmp.plot$cat = factor(tmp.plot$cat, levels = c("Normal","Tumor Associated Epithelial","Carcinoma"))
tmp.plot = tmp.plot[order(tmp.plot$cat),]                   

#Organize rows in aneuploidy output for heatmap.
tmp.l.nw = tmp.l.norm[match(tmp.plot$cell,rownames(tmp.l.norm)),]

#Set heatmap annotation and color.
row.ha = HeatmapAnnotation(Tissue = tmp.plot$cat, name = 'Tissue', width = unit(2, "cm"), show_legend = TRUE, show_annotation_name = FALSE, col = list(Tissue = c("Carcinoma" = "#F8766D", "Normal" = "#00BA38", "Tumor Associated Epithelial" = "#619CFF")), annotation_legend_param = list("Normal", "Tumor Epithelial", "Carcinoma"), which = "row")
library(circlize)
col_fun = colorRamp2(c(-10, 0, 10), c("blue", "white", "red"))
#Plot heatmap.
p = Heatmap(tmp.l.nw, name = 'z-score', col = col_fun, cluster_rows = FALSE, cluster_columns = FALSE, row_names_gp = gpar(fontsize = 9), column_names_gp = gpar(fontsize = 8), column_names_side = c("bottom"), show_column_names = TRUE, show_row_names = FALSE, column_title_gp = gpar(fontsize = 20, fontface = "bold")) + row.ha
print(p)

Extended Data Figure 2

#Load requires libraries.
library(skitools)
library(data.tree)

#Load sikkema metadata table. Format data so that each row represents a full hierarchy for a given celltype.
sikkema_meta = read.table(file.path(params$fileLoc,"sikk_meta_epi.csv"),header=T,sep=",")
sikkema_meta = sikkema_meta[,c("ann_level_1","ann_level_2","ann_level_3","ann_level_4","ann_finest_level")]

#Adjust AT0 annotation to conform with Alveolar annotation.
sikkema_meta[sikkema_meta$ann_finest_level=="AT0",]$ann_level_2 = "Alveolar epithelium"
sikkema_meta[sikkema_meta$ann_finest_level=="AT0",]$ann_level_3 = "AT2"
sikkema_meta[sikkema_meta$ann_finest_level=="AT0",]$ann_level_4 = "AT0"

#Populate AT1 and AT2 annotations for level 3 (i.e.: replace curent None value with the cell name).
sikkema_meta[sikkema_meta$ann_finest_level=="AT2",]$ann_level_4 = "AT2"
sikkema_meta[sikkema_meta$ann_finest_level=="AT1",]$ann_level_4 = "AT1"

formatted_data <- apply(sikkema_meta, 1, function(row) {   
  paste0("level1: ", row[1], ", level2: ", row[2], ", level3: ", row[3], 
         ", level4: ", row[4], ", level5: ", row[5], ", finest_level: ", row[6]) 
})

# Convert the formatted data to a data frame (optional but makes it easier to work with).
formatted_data <- data.frame(formatted_data)

# Add a new column 'pathString' by concatenating non-NA values for each row with '/'
sikkema_meta$pathString <- apply(sikkema_meta, 1, function(row) {   
  paste(na.omit(row), collapse = "/")  # Concatenate non-NA values with "/"
})

#Convert cell level hierarchy into graph.
hierarchy_tree <- as.Node(sikkema_meta)
tree_graph <- ToDiagrammeRGraph(hierarchy_tree)
tree_graph <- DiagrammeR::add_global_graph_attrs(
  graph = tree_graph,
  attr = "rankdir",
  value = "LR",   # 'LR' stands for Left-to-Right layout
  attr_type = "graph"
)
tree_graph <- DiagrammeR::add_global_graph_attrs(
  graph = tree_graph,
  attr = "fontsize",
  value = "30",   # Adjust the font size (default is usually smaller)
  attr_type = "node"
)

# Render the left-to-right tree plot.
DiagrammeR::render_graph(tree_graph)

Extended Data Figure 3

#Load libraries.
library(skitools)

#Load file with cell metadata and identity call results.
normal_labs = fread(file.path(params$fileLoc,"combined_emb_for_normal_for_homogenous_sikk_ep_nw5.csv"))
#Filter results based on uncertainty of identity calls. If uncertainty is lower than 0.5, label as unknown.
normal_labs$ann_finest_lev_transferred_label_filtered = ifelse(normal_labs$ann_finest_lev_transfer_uncert < 0.5,normal_labs$ann_finest_lev_transferred_label_unfiltered,"Unknown")
#Filter for identity call results and true identity labels.
normal.mat = normal_labs[,c("V1","ann_finest_lev_transferred_label_filtered","major.ident.f","ref_or_query")]

#Filter for cells used as query (i.e.: not used for training). Remove all cells with unknown labels, then adjust labels based on celltype identity calls.
query_mat = normal.mat[which(normal.mat$ref_or_query=="query"),]
query_mat = query_mat[which(query_mat$ann_finest_lev_transferred_label_filtered!="Unknown"),]
AT2 = c("AT2","AT0","AT2 proliferating")
AT1 = c("AT1")
smg = c("SMG serous (bronchial)","SMG mucous","SMG duct","SMG serous (nasal)")
ciliated = c("Multiciliated (non-nasal)","Multiciliated (nasal)","Deuterosomal")
basal = c("Basal resting","Suprabasal","Hillock-like")
secretory = c("Goblet (nasal)","Club (nasal)","Club (non-nasal)","pre-TB secretory","Goblet (bronchial)","Goblet (subsegmental)")
other = c("Tuft","Ionocyte","Neuroendocrine")
query_mat[query_mat$ann_finest_lev_transferred_label_filtered %in% AT2,"transfered_labs"]="AT2"
query_mat[query_mat$ann_finest_lev_transferred_label_filtered %in% AT1,"transfered_labs"]="AT1"
query_mat[query_mat$ann_finest_lev_transferred_label_filtered %in% smg,"transfered_labs"]="SMG"
query_mat[query_mat$ann_finest_lev_transferred_label_filtered %in% ciliated,"transfered_labs"]="ciliated"
query_mat[query_mat$ann_finest_lev_transferred_label_filtered %in% basal,"transfered_labs"]="basal"
query_mat[query_mat$ann_finest_lev_transferred_label_filtered %in% rare,"transfered_labs"]="other"
query_mat[query_mat$ann_finest_lev_transferred_label_filtered %in% secretory,"transfered_labs"]="secretory"
query_mat[query_mat$major.ident.f %in% c("Alveolar.Type.II.cells"),"orig_labs"]="AT2"
query_mat[query_mat$major.ident.f %in% c("Club.cells"),"orig_labs"]="secretory"
query_mat[query_mat$major.ident.f %in% c("Alveolar.Type.I.cells"),"orig_labs"]="AT1"
query_mat[query_mat$major.ident.f %in% c("Basal.cells"),"orig_labs"]="basal"
query_mat[query_mat$major.ident.f %in% c("Ciliated.cells"),"orig_labs"]="ciliated"
query_mat[query_mat$major.ident.f %in% c("Neuroendocrine.cells"),"orig_labs"]="other"

#Compute accuracy by comparing identity calls vs true identities.
AT2_mat = query_mat[which(query_mat$orig_labs=="AT2"),]
AT2_acc = sum(AT2_mat$transfered_labs==AT2_mat$orig_labs)/nrow(AT2_mat)

AT1_mat = query_mat[which(query_mat$orig_labs=="AT1"),]
AT1_acc = sum(AT1_mat$transfered_labs==AT1_mat$orig_labs)/nrow(AT1_mat)

secretory_mat = query_mat[which(query_mat$orig_labs=="secretory"),]
secretory_acc = sum(secretory_mat$transfered_labs==secretory_mat$orig_labs)/nrow(secretory_mat)

basal_mat = query_mat[which(query_mat$orig_labs=="basal"),]
basal_acc = sum(basal_mat$transfered_labs==basal_mat$orig_labs)/nrow(basal_mat)

rare_mat = query_mat[which(query_mat$orig_labs=="other"),]
rare_acc = sum(rare_mat$transfered_labs==rare_mat$orig_labs)/nrow(rare_mat)

ciliated_mat = query_mat[which(query_mat$orig_labs=="ciliated"),]
ciliated_acc = sum(ciliated_mat$transfered_labs==ciliated_mat$orig_labs)/nrow(ciliated_mat)

#Combine results into a matrix and plot.
acc_mat = c(AT2_acc,AT1_acc,secretory_acc,basal_acc,ciliated_acc,rare_acc)
acc_mat = as.matrix(acc_mat)
rownames(acc_mat) = c("AT2","AT1","secretory","basal","ciliated","other")
acc_mat = as.data.frame(acc_mat)
acc_mat$cat = rownames(acc_mat)

p = ggplot(acc_mat, aes(x = cat, y = V1)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() + coord_flip()
print(p)

Extended Data Figure 4

Extended Data Figure 4A

#Load required libraries.
library(skitools)

#Load GRanges with centroid expression data.
juan.genes.gr.nw = readRDS(file.path(params$fileLoc,"genes_gr_LUSC.rds"))
juan.genes.gr.nw.dt = gr2dt(juan.genes.gr.nw)

#Define quantiles for partitioning the genes by basal resting expression.
sup.quant <- quantile(juan.genes.gr.nw.dt$Basal.resting)
sup.quant.2 <- quantile(juan.genes.gr.nw.dt$Basal.resting, probs = c(0.33, 0.67, 1))

#Define a data.frame starting from the expression information.
sikk_sup <- juan.genes.gr.nw.dt$Basal.resting %>% as.data.frame()
#Add quantile information and gene name.
colnames(sikk_sup)[1] = 'Goblet'
sikk_sup$gene <- juan.genes.gr.nw.dt$gene_name
lowsup.quant <- sikk_sup[which(sikk_sup$Goblet <= -6.888385), ]
midsup.quant <- sikk_sup[which(sikk_sup$Goblet > -6.888385  & sikk_sup$Goblet <= -6.101867) , ]
highsup.quant <- sikk_sup[which(sikk_sup$Goblet > -6.101867) , ]
lowsup.quant$Quart = 'Q1'
midsup.quant$Quart = 'Q2'
highsup.quant$Quart = 'Q3'
sup.quart.mat <- rbind(lowsup.quant,midsup.quant,highsup.quant)

#Load GRanges with snv counts information per gene for LUAD. Convert to mutational density.
mgenes= readRDS(file.path(params$fileLoc,"mgenes.rds"))
mgenes2 <- copy(mgenes)
juan.genes.gr.nw.dt$densityLUSC <- (10**6)*juan.genes.gr.nw.dt$snv.count/(juan.genes.gr.nw.dt$width*53)

#Make sure quantile data.frame gene order matches snv counts GRanges gene order. Remove NAs. Then add gene mutational density information for LUAD to the data.frame.
supLUSC_centwrtTMB = sup.quart.mat[match(juan.genes.gr.nw.dt$gene_name,sup.quart.mat$gene),]
supLUSC_centwrtTMB = supLUSC_centwrtTMB[complete.cases(supLUSC_centwrtTMB),]
sup.luscmat = cbind(supLUSC_centwrtTMB,juan.genes.gr.nw.dt$densityLUSC)
colnames(sup.luscmat)[[4]] = "densityLUSC"

#Compute mean and standard deviation values for mutational density per expression quantile.
sup.luscmat.Q1 = sup.luscmat[which(sup.luscmat$Quart=="Q1"),]
sup.luscmat.Q1mean = mean(sup.luscmat.Q1$densityLUSC)
sup.luscmat.Q1se = sd(sup.luscmat.Q1$densityLUSC)/sqrt(length(sup.luscmat.Q1$densityLUSC)) 
sup.luscmat.Q2 = sup.luscmat[which(sup.luscmat$Quart=="Q2"),]
sup.luscmat.Q2mean = mean(sup.luscmat.Q2$densityLUSC)
sup.luscmat.Q2se = sd(sup.luscmat.Q2$densityLUSC)/sqrt(length(sup.luscmat.Q2$densityLUSC))
sup.luscmat.Q3 = sup.luscmat[which(sup.luscmat$Quart=="Q3"),]
sup.luscmat.Q3mean = mean(sup.luscmat.Q3$densityLUSC)
sup.luscmat.Q3se = sd(sup.luscmat.Q3$densityLUSC)/sqrt(length(sup.luscmat.Q3$densityLUSC))
#Save mean and sd values as a new data.frame.
mean_sup_mat_lusc = as.data.frame(rbind(sup.luscmat.Q1mean,sup.luscmat.Q2mean,sup.luscmat.Q3mean))
se_sup_mat_lusc = as.data.frame(rbind(sup.luscmat.Q1se,sup.luscmat.Q2se,sup.luscmat.Q3se))
Q1.quart = "Q1"
Q2.quart = "Q2"
Q3.quart = "Q3"
quart.mat = as.data.frame(rbind(Q1.quart,Q2.quart,Q3.quart))
sup.mean.mat = cbind(quart.mat,mean_sup_mat_lusc,se_sup_mat_lusc)
rownames(sup.mean.mat) = c(1,2,3)
colnames(sup.mean.mat) = c("Quart","Mean_TMB","SE")

#Do the same process for AT2 expression. Define quantiles.
AT2.quant <- quantile(juan.genes.gr.nw.dt$AT2)
AT2.quant.2 <- quantile(juan.genes.gr.nw.dt$AT2, probs = c(0.33, 0.67, 1))

#Define a data.frame starting from the expression information.
sikk_AT2 <- juan.genes.gr.nw.dt$AT2 %>% as.data.frame()
#Add quantile information and gene name.
colnames(sikk_AT2)[1] = 'AT2'
sikk_AT2$gene <- juan.genes.gr.nw.dt$gene_name
lowAT2.quant <- sikk_AT2[which(sikk_AT2$AT2 <= -6.886128), ]
midAT2.quant <- sikk_AT2[which(sikk_AT2$AT2 > -6.886128  & sikk_AT2$AT2 <= -5.963367) , ]
highAT2.quant <- sikk_AT2[which(sikk_AT2$AT2 > -5.963367) , ]
lowAT2.quant$Quart = 'Q1'
midAT2.quant$Quart = 'Q2'
highAT2.quant$Quart = 'Q3'
AT2.quart.mat <- rbind(lowAT2.quant,midAT2.quant,highAT2.quant)

#Make sure quantile data.frame gene order matches snv counts GRanges gene order. Remove NAs. Then add gene mutational density information for LUAD to the data.frame.
AT2lusc_centwrtTMB = AT2.quart.mat[match(juan.genes.gr.nw.dt$gene_name,AT2.quart.mat$gene),]
AT2lusc_centwrtTMB = AT2lusc_centwrtTMB[complete.cases(AT2lusc_centwrtTMB),]  # remove NAs
AT2.luscmat = cbind(AT2lusc_centwrtTMB,juan.genes.gr.nw.dt$densityLUSC)
colnames(AT2.luscmat)[[4]] = "densityLUSC"

#Compute mean and standard deviation values for mutational density per expression quantile.
AT2.luscmat.Q1 = AT2.luscmat[which(AT2.luscmat$Quart=="Q1"),]
AT2.luscmat.Q1mean = mean(AT2.luscmat.Q1$densityLUSC)
AT2.luscmat.Q1se = sd(AT2.luscmat.Q1$densityLUSC)/sqrt(length(AT2.luscmat.Q1$densityLUSC)) 
AT2.luscmat.Q2 = AT2.luscmat[which(AT2.luscmat$Quart=="Q2"),]
AT2.luscmat.Q2mean = mean(AT2.luscmat.Q2$densityLUSC)
AT2.luscmat.Q2se = sd(AT2.luscmat.Q2$densityLUSC)/sqrt(length(AT2.luscmat.Q2$densityLUSC))
AT2.luscmat.Q3 = AT2.luscmat[which(AT2.luscmat$Quart=="Q3"),]
AT2.luscmat.Q3mean = mean(AT2.luscmat.Q3$densityLUSC)
AT2.luscmat.Q3se = sd(AT2.luscmat.Q3$densityLUSC)/sqrt(length(AT2.luscmat.Q3$densityLUSC))
mean_AT2_mat = as.data.frame(rbind(AT2.luscmat.Q1mean,AT2.luscmat.Q2mean,AT2.luscmat.Q3mean))
#Save mean and sd values as a new data.frame.
se_AT2_mat = as.data.frame(rbind(AT2.luscmat.Q1se,AT2.luscmat.Q2se,AT2.luscmat.Q3se))
Q1.quart = "Q1"
Q2.quart = "Q2"
Q3.quart = "Q3"

#Combine data.frames with mean and standard deviation per expression quartile for AT2 and basal resting cells.
quart.mat = as.data.frame(rbind(Q1.quart,Q2.quart,Q3.quart))
AT2.mean.mat = cbind(quart.mat,mean_AT2_mat,se_AT2_mat)
rownames(AT2.mean.mat) = c(1,2,3)
colnames(AT2.mean.mat) = c("Quart","Mean_TMB","SE")
AT2.mean.mat$celltype = "AT2"
sup.mean.mat$celltype="Basalresting"

#Plot the graph.
mean.mat.lusc = rbind(AT2.mean.mat,sup.mean.mat)
mean.mat.lusc$celltype = as.factor(mean.mat.lusc$celltype)
p = ggplot(mean.mat.lusc, aes(x = Quart, y = Mean_TMB, group = celltype, color = celltype)) + 
  geom_point(size = 4) + 
  geom_line(size = 1.2) + 
  geom_errorbar(aes(ymin = Mean_TMB - SE, ymax = Mean_TMB + SE), width = 0.2, size = 0.9, color = 'black') +
  theme(axis.text = element_text(size = 20)) + 
  theme(axis.title = element_text(size = 20)) + 
  theme(
    panel.background = element_rect(fill = 'transparent'),
    plot.background = element_rect(fill = 'transparent', color = NA),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.background = element_rect(fill = 'transparent'),
    legend.box.background = element_rect(fill = 'transparent')
  )

print(p)

Extended Data Figure 4B

#Load required libraries.
library(skitools)

#Load GRanges with centroid expression data.
juan.genes.gr.nw = readRDS(file.path(params$fileLoc,"genes_gr_LUSC.rds"))
juan.genes.gr.nw.dt = gr2dt(juan.genes.gr.nw)

#Define quantiles for partitioning the genes by basal resting expression.
sup.quant <- quantile(juan.genes.gr.nw.dt$Basal.resting)
sup.quant.2 <- quantile(juan.genes.gr.nw.dt$Basal.resting, probs = c(0.33, 0.67, 1))

#Define a data.frame starting from the expression information.
sikk_sup <- juan.genes.gr.nw.dt$Basal.resting %>% as.data.frame()
#Add quantile information and gene name.
colnames(sikk_sup)[1] = 'Goblet'
sikk_sup$gene <- juan.genes.gr.nw.dt$gene_name
lowsup.quant <- sikk_sup[which(sikk_sup$Goblet <= -6.888385), ]
midsup.quant <- sikk_sup[which(sikk_sup$Goblet > -6.888385  & sikk_sup$Goblet <= -6.101867) , ]
highsup.quant <- sikk_sup[which(sikk_sup$Goblet > -6.101867) , ]
lowsup.quant$Quart = 'Q1'
midsup.quant$Quart = 'Q2'
highsup.quant$Quart = 'Q3'
sup.quart.mat <- rbind(lowsup.quant,midsup.quant,highsup.quant)

#Load GRanges with snv counts information per gene for LUAD. Convert to mutational density.
mgenes= readRDS(file.path(params$fileLoc,"mgenes.rds"))
mgenes2 <- copy(mgenes)
juan.genes.gr.nw.dt$densityLUSC <- (10**6)*juan.genes.gr.nw.dt$snv.count/(juan.genes.gr.nw.dt$width*53)

#Make sure quantile data.frame gene order matches snv counts GRanges gene order. Remove NAs. Then add gene mutational density information for LUAD to the data.frame.
supLUSC_centwrtTMB = sup.quart.mat[match(juan.genes.gr.nw.dt$gene_name,sup.quart.mat$gene),]
supLUSC_centwrtTMB = supLUSC_centwrtTMB[complete.cases(supLUSC_centwrtTMB),]
sup.luscmat = cbind(supLUSC_centwrtTMB,juan.genes.gr.nw.dt$densityLUSC)
colnames(sup.luscmat)[[4]] = "densityLUSC"

#Compute mean and standard deviation values for mutational density per expression quantile.
sup.luscmat.Q1 = sup.luscmat[which(sup.luscmat$Quart=="Q1"),]
sup.luscmat.Q1mean = mean(sup.luscmat.Q1$densityLUSC)
sup.luscmat.Q1se = sd(sup.luscmat.Q1$densityLUSC)/sqrt(length(sup.luscmat.Q1$densityLUSC)) 
sup.luscmat.Q2 = sup.luscmat[which(sup.luscmat$Quart=="Q2"),]
sup.luscmat.Q2mean = mean(sup.luscmat.Q2$densityLUSC)
sup.luscmat.Q2se = sd(sup.luscmat.Q2$densityLUSC)/sqrt(length(sup.luscmat.Q2$densityLUSC))
sup.luscmat.Q3 = sup.luscmat[which(sup.luscmat$Quart=="Q3"),]
sup.luscmat.Q3mean = mean(sup.luscmat.Q3$densityLUSC)
sup.luscmat.Q3se = sd(sup.luscmat.Q3$densityLUSC)/sqrt(length(sup.luscmat.Q3$densityLUSC))
#Save mean and sd values as a new data.frame.
mean_sup_mat_lusc = as.data.frame(rbind(sup.luscmat.Q1mean,sup.luscmat.Q2mean,sup.luscmat.Q3mean))
se_sup_mat_lusc = as.data.frame(rbind(sup.luscmat.Q1se,sup.luscmat.Q2se,sup.luscmat.Q3se))
Q1.quart = "Q1"
Q2.quart = "Q2"
Q3.quart = "Q3"
quart.mat = as.data.frame(rbind(Q1.quart,Q2.quart,Q3.quart))
sup.mean.mat = cbind(quart.mat,mean_sup_mat_lusc,se_sup_mat_lusc)
rownames(sup.mean.mat) = c(1,2,3)
colnames(sup.mean.mat) = c("Quart","Mean_TMB","SE")

#Do the same process for AT2 expression. Define quantiles.
AT2.quant <- quantile(juan.genes.gr.nw.dt$AT2)
AT2.quant.2 <- quantile(juan.genes.gr.nw.dt$AT2, probs = c(0.33, 0.67, 1))

#Define a data.frame starting from the expression information.
sikk_AT2 <- juan.genes.gr.nw.dt$AT2 %>% as.data.frame()
#Add quantile information and gene name.
colnames(sikk_AT2)[1] = 'AT2'
sikk_AT2$gene <- juan.genes.gr.nw.dt$gene_name
lowAT2.quant <- sikk_AT2[which(sikk_AT2$AT2 <= -6.886128), ]
midAT2.quant <- sikk_AT2[which(sikk_AT2$AT2 > -6.886128  & sikk_AT2$AT2 <= -5.963367) , ]
highAT2.quant <- sikk_AT2[which(sikk_AT2$AT2 > -5.963367) , ]
lowAT2.quant$Quart = 'Q1'
midAT2.quant$Quart = 'Q2'
highAT2.quant$Quart = 'Q3'
AT2.quart.mat <- rbind(lowAT2.quant,midAT2.quant,highAT2.quant)

#Make sure quantile data.frame gene order matches snv counts GRanges gene order. Remove NAs. Then add gene mutational density information for LUAD to the data.frame.
AT2lusc_centwrtTMB = AT2.quart.mat[match(juan.genes.gr.nw.dt$gene_name,AT2.quart.mat$gene),]
AT2lusc_centwrtTMB = AT2lusc_centwrtTMB[complete.cases(AT2lusc_centwrtTMB),]
AT2.luscmat = cbind(AT2lusc_centwrtTMB,juan.genes.gr.nw.dt$densityLUSC)
colnames(AT2.luscmat)[[4]] = "densityLUSC"

#Compute mean and standard deviation values for mutational density per expression quantile.
AT2.luscmat.Q1 = AT2.luscmat[which(AT2.luscmat$Quart=="Q1"),]
AT2.luscmat.Q1mean = mean(AT2.luscmat.Q1$densityLUSC)
AT2.luscmat.Q1se = sd(AT2.luscmat.Q1$densityLUSC)/sqrt(length(AT2.luscmat.Q1$densityLUSC)) 
AT2.luscmat.Q2 = AT2.luscmat[which(AT2.luscmat$Quart=="Q2"),]
AT2.luscmat.Q2mean = mean(AT2.luscmat.Q2$densityLUSC)
AT2.luscmat.Q2se = sd(AT2.luscmat.Q2$densityLUSC)/sqrt(length(AT2.luscmat.Q2$densityLUSC))
AT2.luscmat.Q3 = AT2.luscmat[which(AT2.luscmat$Quart=="Q3"),]
AT2.luscmat.Q3mean = mean(AT2.luscmat.Q3$densityLUSC)
AT2.luscmat.Q3se = sd(AT2.luscmat.Q3$densityLUSC)/sqrt(length(AT2.luscmat.Q3$densityLUSC))
mean_AT2_mat = as.data.frame(rbind(AT2.luscmat.Q1mean,AT2.luscmat.Q2mean,AT2.luscmat.Q3mean))
#Save mean and sd values as a new data.frame.
se_AT2_mat = as.data.frame(rbind(AT2.luscmat.Q1se,AT2.luscmat.Q2se,AT2.luscmat.Q3se))
Q1.quart = "Q1"
Q2.quart = "Q2"
Q3.quart = "Q3"

#Combine data.frames with mean and standard deviation per expression quartile for AT2 and basal resting cells.
quart.mat = as.data.frame(rbind(Q1.quart,Q2.quart,Q3.quart))
AT2.mean.mat = cbind(quart.mat,mean_AT2_mat,se_AT2_mat)
rownames(AT2.mean.mat) = c(1,2,3)
colnames(AT2.mean.mat) = c("Quart","Mean_TMB","SE")
AT2.mean.mat$celltype = "AT2"
sup.mean.mat$celltype="Basalresting"

#Plot the graph.
mean.mat.lusc = rbind(AT2.mean.mat,sup.mean.mat)
mean.mat.lusc$celltype = as.factor(mean.mat.lusc$celltype)
p = ggplot(mean.mat.lusc, aes(x = Quart, y = Mean_TMB, group = celltype, color = celltype)) + 
  geom_point(size = 4) + 
  geom_line(size = 1.2) + 
  geom_errorbar(aes(ymin = Mean_TMB - SE, ymax = Mean_TMB + SE), width = 0.2, size = 0.9, color = 'black') +
  theme(axis.text = element_text(size = 20)) + 
  theme(axis.title = element_text(size = 20)) + 
  theme(
    panel.background = element_rect(fill = 'transparent'),
    plot.background = element_rect(fill = 'transparent', color = NA),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.background = element_rect(fill = 'transparent'),
    legend.box.background = element_rect(fill = 'transparent')
  )

print(p)

Extended Data Figure 6

# Accuracy for predicting true cell type at different levels 

sims_acc_results <- fread(file.path(params$fileLoc,'edf6_luad_sims_results.csv'))

# accuracy at level finest
p1_5 <- ggplot(sims_acc_results[, .(celltype, accuracy_level_finest)] %>% as.data.frame(), aes(x = celltype, y = accuracy_level_finest)) +
  geom_point(size=2, shape=23, color = 'darkgreen') + 
  theme_classic() +
  theme( axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ylim(0,1) + 
  ggtitle("Finest level match") +
  xlab("True cell type") + ylab("Accuracy")

# accuracy at level 4 resolution
p1_4 <- ggplot(sims_acc_results[, .(celltype, accuracy_level4)] %>% as.data.frame(), aes(x = celltype, y = accuracy_level4)) +
  geom_point(size=2, shape=23, color = 'darkgreen') + 
  theme_classic() +
  theme( axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))  + ylim(0,1) + 
  ggtitle("Level 4 match") +
  xlab("True cell type") + ylab("Accuracy")

# accuracy at level 3 resolution
p1_3 <- ggplot(sims_acc_results[, .(celltype, accuracy_level3)] %>% as.data.frame(), aes(x = celltype, y = accuracy_level3)) +
  geom_point(size=2, shape=23, color = 'darkgreen') + 
  theme_classic() +
  theme( axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))  + ylim(0,1) + 
  ggtitle("Level 3 match") +
  xlab("True cell type") + ylab("Accuracy")

# accuracy at level 2 resolution
p1_2 <- ggplot(sims_acc_results[, .(celltype, accuracy_level2)] %>% as.data.frame(), aes(x = celltype, y = accuracy_level2)) +
  geom_point(size=2, shape=23, color = 'darkgreen') + 
  theme_classic() +
  theme( axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))  + ylim(0,1) + 
  ggtitle("Level 2 match") +
  xlab("True cell type") + ylab("Accuracy")

# accuracy at level distal - non - distal resolution
p1_1 <- ggplot(sims_acc_results[, .(celltype, accuracy_leveldnd)] %>% as.data.frame(), aes(x = celltype, y = accuracy_leveldnd)) +
  geom_point(size=2, shape=23, color = 'darkgreen') + 
  theme_classic() +
  theme( axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))  + ylim(0,1) + 
  ggtitle("Distal vs non-distal match") +
  xlab("True cell type") + ylab("Accuracy")

p_acc <- ggpubr::ggarrange(p1_1, p1_2, p1_3, p1_4, p1_5, ncol = 3, nrow = 2)
print(p_acc)

Extended Data Figure 7

#Load libraries.
library(skitools)

#Plots can be made for univariate and multivariate simultaneously or individually. Comments are made when relevant to do one or the other.

#Load simulation result files. Add Method column to identify if results are uni or multivariate.
resDT = readRDS(file.path(params$fileLoc,"edf7_testRun_withOverlap_Cov.rds"))
resDT$Method = "Univariate"

#If results are plotted in the same graph, uncomment the following lines, adjusting the Method column value accordingly. 
#MulDT = readRDS(file.path(params$fileLoc,"edf7_testRun_withOverlapMultivariatewithCovs.rds"))
# MulDT$Method = "Multivariate"
#resDT = rbind(resDT,MulDT,fill=TRUE)

#Load csv with Sikkema annotations.
annotSikk = read.csv(file.path(params$fileLoc,"edf7_sikk_meta_epi3.csv"))
#Add proximal and distal annotations.
annotSikk$ann_level_PD = ifelse(grepl("^AT",annotSikk$ann_finest_level),"Proximal","Distal")

#Pick a level (i.e.: column).
#Check the overlapping cell calls. Are all the overlapping cells from the same level? If not just put ambiguous. If yes use the level label.
resDT$Overlap_Level = ""
resDT$True_Level = ""
#Define a column to determine if sim had the right call (1) or not (0).
resDT$Lev_Hit = 0

#For the true cell just add the corresponding level value.
#Check if the 2 correspond. If yes add 1 to a new level accuracy column. If not just add a 0.  
for(i in 1:nrow(resDT)){
#    if(i%%100==0){
#        print(i)
#     }
    resDT$True_Level[i] = annotSikk[annotSikk$ann_finest_level==resDT$thisCell[i],]$ann_level_PD
    calledLevels = unique(annotSikk[annotSikk$ann_finest_level %in% strsplit(resDT$allOverlap[i],",")[[1]],]$ann_level_PD)
    if(length(calledLevels)>1){
        resDT$Overlap_Level[i] = "Ambiguous"
     }else if(length(calledLevels)==0){
         resDT$Overlap_Level[i] = "No call"
    }else{
         resDT$Overlap_Level[i] = calledLevels
     }
    resDT$Lev_Hit[i] = ifelse(resDT$Overlap_Level[i] == resDT$True_Level[i],1,0)
}

#Get average accuracy per TMB using the Lev_Hit column (accuracy = number of 1s in said column vs number of results for that TMB).
resDT$Lev_Hit = as.numeric(resDT$Lev_Hit)

#Results are grouped eithet by Method and TMB or only TMB depending on the desired graph.
#average_df = resDT %>% group_by(Method,TMB) %>% summarise(average_value = mean(Lev_Hit, na.rm = TRUE))
average_df = resDT %>% group_by(TMB) %>% dplyr::summarise(average_value = mean(Lev_Hit, na.rm = TRUE))  
average_df = as.data.frame(average_df)  

#Line graph is plotted. Depending on whether both uni and multivariate are plotted use the first line or the second (comment/uncomment adequately).
p =  ggplot(data=average_df, aes(x=log10(TMB), y=average_value)) + geom_line(color="blue")+ geom_point() + ggtitle("Accuracy per TMB Multivariate - Proximal vs Distal" ) + xlab("Log10(TMB) (mut/mb)") + ylab("Accuracy") + scale_y_continuous(limits=c(0,1)) +   theme(text = element_text(size = 20)) 
#p =  ggplot(data=average_df, aes(x=log10(TMB), y=average_value, color = Method, group=Method)) + geom_line(color="red")+ geom_point(aes(shape=Method),size=2) + ggtitle("Accuracy per TMB - Celltype" ) + xlab("Log10(TMB) (mut/mb)") + ylab("Accuracy") + scale_y_continuous(limits=c(0,1)) +   theme(text = element_text(size = 20)) +  scale_color_manual(values = c("Univariate" = "red", "Multivariate" = "blue")) 
print(p)

Extended Data Figure 8

Extended Data Figure 8A

# Heat map for LUSC 

nsclc_col <- readRDS(file.path(params$fileLoc,'nsclc_hm_col.rds'))
hm_lusc <- fread(file.path(params$fileLoc,'hm_lusc.csv'))
hm_lusc <- hm_lusc %>% as.data.frame()
column_ha2 = HeatmapAnnotation(
  sbs4 = anno_barplot(hm_lusc[,25], gp = gpar(col = "red", fill = "#FF0000")),
  sbs1 = anno_barplot( hm_lusc[,26], gp = gpar(col = "green", fill = "#99FF33")) ,
  sbs5 = anno_barplot(hm_lusc[,27], gp = gpar(col = "green", fill = "#99FF33")) ,
  sbs2 = anno_barplot(hm_lusc[,28], gp = gpar(col = "orange", fill = "#FF8000")) ,
  sbs13 = anno_barplot(hm_lusc[,29], gp = gpar(col = "orange", fill = "#FF8000")) ,
  id1 = anno_barplot(hm_lusc[,31], gp = gpar(col = "green", fill = "#99FF33")) ,
  id2 = anno_barplot(hm_lusc[,32], gp = gpar(col = "violet", fill = "#9900CC")) ,
  id3 = anno_barplot(hm_lusc[,33], gp = gpar(col = "red", fill = "#FF0000")) ,
  id12 = anno_barplot(hm_lusc[,34], gp = gpar(col = "violet", fill = "#9900CC")) ,
  smoker = hm_lusc[,35],
  col = list(
    smoker = c("Smoker" = "orange", "Never Smoker" = "blue") 
  )
)

set.seed(90210)
Heatmap(t(hm_lusc[,1:23]), name = "Relative Risk", col = nsclc_col, cluster_rows = TRUE, cluster_columns = TRUE, row_names_gp = gpar(fontsize = 15), column_names_gp = gpar(fontsize = 10), 
            column_names_side = c("bottom"), show_column_names = FALSE, column_km = 4, column_km_repeats = 100, top_annotation = column_ha2,
            show_parent_dend_line = FALSE, column_gap = unit(c(4), "mm"), column_title = NULL, border = TRUE)

Extended Data Figure 8B

# LUSC - Cluster 1 Distal Lung
edf_8_data <- readRDS(file.path(params$fileLoc,'edf8.rds'))
ggplot(edf_8_data[cluster == 'Distal'], aes(x = reorder(celltype,estimate), y = estimate, fill = Cell_Class)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  scale_fill_manual(values = c("grey")) +
  geom_errorbar(aes(ymax = ci.upper, ymin = ci.lower), size = 0.65, width = 0.25, color = 'black') +
  scale_y_continuous(breaks = seq(in.axis.min-0.05, in.axis.max+0.05, in.axis.breaks), labels = function(y) y + 1, limits = c(-0.25, 0.35)) + # LUAD
  scale_fill_manual("legend", values = c("Proximal" = "olivedrab3", "Distal" = "darkgoldenrod3")) +
  theme_bw() +
  theme(panel.grid.minor = element_blank()) +
  xlab("") +
  ylab("Relative Risk") +
  ggtitle(paste0('LUSC Cluster 1')) +
  theme(axis.text.y = element_text(size = 10, angle = 0, vjust = 0.5, hjust = 1)) + guides(fill=guide_legend(title="Cell types")) +
  coord_flip()


# LUSC - Cluster 2 Proximal 1
ggplot(edf_8_data[cluster == 'Proximal_1'], aes(x = reorder(celltype,estimate), y = estimate, fill = Cell_Class)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  scale_fill_manual(values = c("grey")) +
  geom_errorbar(aes(ymax = ci.upper, ymin = ci.lower), size = 0.65, width = 0.25, color = 'black') +
  scale_y_continuous(breaks = seq(in.axis.min-0.05, in.axis.max+0.05, in.axis.breaks), labels = function(y) y + 1, limits = c(-0.25, 0.35)) + # LUAD
  scale_fill_manual("legend", values = c("Proximal" = "olivedrab3", "Distal" = "darkgoldenrod3")) +
  theme_bw() +
  theme(panel.grid.minor = element_blank()) +
  xlab("") +
  ylab("Relative Risk") +
  ggtitle(paste0('LUSC Cluster 2')) +
  theme(axis.text.y = element_text(size = 10, angle = 0, vjust = 0.5, hjust = 1)) + guides(fill=guide_legend(title="Cell types")) +
  coord_flip()


# LUSC - Cluster 3  Proximal_2
ggplot(edf_8_data[cluster == 'Proximal_2'], aes(x = reorder(celltype,estimate), y = estimate, fill = Cell_Class)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  scale_fill_manual(values = c("grey")) +
  geom_errorbar(aes(ymax = ci.upper, ymin = ci.lower), size = 0.65, width = 0.25, color = 'black') +
  scale_y_continuous(breaks = seq(in.axis.min-0.05, in.axis.max+0.05, in.axis.breaks), labels = function(y) y + 1, limits = c(-0.25, 0.35)) + # LUAD
  scale_fill_manual("legend", values = c("Proximal" = "olivedrab3", "Distal" = "darkgoldenrod3")) +
  theme_bw() +
  theme(panel.grid.minor = element_blank()) +
  xlab("") +
  ylab("Relative Risk") +
  ggtitle(paste0('LUSC Cluster 3')) +
  theme(axis.text.y = element_text(size = 10, angle = 0, vjust = 0.5, hjust = 1)) + guides(fill=guide_legend(title="Cell types")) +
  coord_flip()

Extended Data Figure 9

Extended Data Figure 9A

pts_coo_id_edf9a <- readRDS(file.path(params$fileLoc,'pts_coo_id_4B.rds'))
pts_coo_id_edf9a$Lineage_plasticity <- ''
pts_coo_id_edf9a[Identity == 'Distal Lung' & Origin == 'Distal Lung' ]$Lineage_plasticity <- 'Lineage conserved'
pts_coo_id_edf9a[Identity == 'Proximal Lung' & Origin == 'Distal Lung' ]$Lineage_plasticity <- 'Lineage plasticity'
pts_coo_id_edf9a[Identity == 'Distal Lung' & Origin == 'Non-Distal Lung' ]$Lineage_plasticity <- 'Lineage plasticity'
pts_coo_id_edf9a[Identity == 'Proximal Lung' & Origin == 'Non-Distal Lung' ]$Lineage_plasticity <- 'Lineage conserved'

res.plot =  pts_coo_id_edf9a[, prop.test(sum(Lineage_plasticity == 'Lineage plasticity'), .N) %>% dflm %>% cbind(nprox = sum(Lineage_plasticity == 'Lineage plasticity'), tot = .N), by = .(TP53_mut = ifelse(TP53_mut, 'TP53 MUT', 'WT'), Origin)][, fracprox := estimate]
res.plot$Origin = factor(res.plot$Origin, levels = c('Non-Distal Lung','Distal Lung'))
res.plot$TP53_mut = factor(res.plot$TP53_mut, levels = c('TP53 MUT','WT'))
ggplot(res.plot, aes(x = TP53_mut, y = fracprox, fill = TP53_mut)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  facet_grid(~Origin) +
  scale_fill_manual(values = c(wes_palettes$Royal1[2], wes_palettes$Royal1[1])) +
  labs(title = '', x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 0, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = TP53_mut, y = ci.upper + 0.05, label = paste0(nprox, '/', tot)), size = 7) + 
  guides(fill = guide_legend(title = 'Fig 4C - Lineage plasticity fraction')) + theme(legend.position = "bottom")

Extended Data Figure 9B

library(ggsankey)
tmp.plot.luad.df3 <- readRDS(file.path(params$fileLoc,'edf9b.rds'))

tmp.plot.luad.pl <- ggplot(tmp.plot.luad.df3, aes(x = x,                        
                                                  next_x = next_x,                                     
                                                  node = node,
                                                  next_node = next_node,        
                                                  fill = factor(node),
                                                  label = paste0(node, " = ", n)))             # This Creates a label for each node

tmp.plot.luad.pl <- tmp.plot.luad.pl +geom_sankey(flow.alpha = 0.5,          #This Creates the transparency of your node 
                                                  node.color = "black",     # This is your node color        
                                                  show.legend = TRUE)        # This determines if you want your legend to show

tmp.plot.luad.pl <- tmp.plot.luad.pl + geom_sankey_label(Size = 3,
                                                         color = "black", 
                                                         fill = "white") # This specifies the Label format for each node 
tmp.plot.luad.pl <- tmp.plot.luad.pl + theme_bw()
tmp.plot.luad.pl <- tmp.plot.luad.pl + theme(legend.position = 'none')
tmp.plot.luad.pl <- tmp.plot.luad.pl + theme(axis.title = element_blank(),
                                             axis.text.y = element_blank(),
                                             axis.ticks = element_blank(),
                                             panel.grid = element_blank())
tmp.plot.luad.pl <- tmp.plot.luad.pl + labs(title = "Origin - Identity - Histology")
tmp.plot.luad.pl <- tmp.plot.luad.pl + labs(subtitle = "LUAD")
tmp.plot.luad.pl <- tmp.plot.luad.pl + labs(fill = 'Nodes')
tmp.plot.luad.pl

Extended Data Figure 9C

edf9 <- readRDS(file.path(params$fileLoc,'edf9.rds'))

ik <-  'Papillary'


freq.plot.tmp.f = edf9

freq.plot.tmp.f$clust.int = freq.plot.tmp.f$T_Identity
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('Non-distal identity','Distal identity'))
non.dist.mut = res.plot[clust.int == 'Non-distal identity',]$nmut
non.dist.wt = res.plot[clust.int == 'Non-distal identity',]$tot - res.plot[clust.int == 'Non-distal identity',]$nmut
dist.mut = res.plot[clust.int == 'Distal identity',]$nmut
dist.wt = res.plot[clust.int == 'Distal identity',]$tot - res.plot[clust.int == 'Distal identity',]$nmut

prox.dist.fisher = matrix(c(non.dist.mut, dist.mut, non.dist.wt, dist.wt), nrow = 2, byrow = TRUE) %>% fisher.test %>% dflm


ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'forestgreen', 'darkgoldenrod3')) +
  labs(title = paste0('Papillary fraction of LUAD identity'), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = 'Identity')) + theme(legend.position = "top")

Extended Data Figure 9D

# by Identity 

edf9 <- readRDS(file.path(params$fileLoc,'edf9.rds'))

ik <-  'NSCLC_NOS'

freq.plot.tmp.f = edf9

freq.plot.tmp.f$clust.int = freq.plot.tmp.f$T_Identity
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('Non-distal identity','Distal identity'))
non.dist.mut = res.plot[clust.int == 'Non-distal identity',]$nmut
non.dist.wt = res.plot[clust.int == 'Non-distal identity',]$tot - res.plot[clust.int == 'Non-distal identity',]$nmut
dist.mut = res.plot[clust.int == 'Distal identity',]$nmut
dist.wt = res.plot[clust.int == 'Distal identity',]$tot - res.plot[clust.int == 'Distal identity',]$nmut

prox.dist.fisher = matrix(c(non.dist.mut, dist.mut, non.dist.wt, dist.wt), nrow = 2, byrow = TRUE) %>% fisher.test %>% dflm


ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'forestgreen', 'darkgoldenrod3', '#bdbdbd')) +
  labs(title = paste0('NSCLC-NOS fraction of LUAD identity'), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = 'Identity')) + theme(legend.position = "top")

Extended Data Figure 9E

# NSCLC-NOS vs TP53

edf9 <- readRDS(file.path(params$fileLoc,'edf9.rds'))

ik <-  'NSCLC_NOS'

freq.plot.tmp.f = edf9

freq.plot.tmp.f$clust.int = freq.plot.tmp.f$TP53_mut
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('TP53 MUT','WT'))
non.dist.mut = res.plot[clust.int == 'TP53 MUT',]$nmut
non.dist.wt = res.plot[clust.int == 'TP53 MUT',]$tot - res.plot[clust.int == 'TP53 MUT',]$nmut
dist.mut = res.plot[clust.int == 'WT',]$nmut
dist.wt = res.plot[clust.int == 'WT',]$tot - res.plot[clust.int == 'WT',]$nmut

prox.dist.fisher = matrix(c(non.dist.mut, dist.mut, non.dist.wt, dist.wt), nrow = 2, byrow = TRUE) %>% fisher.test %>% dflm


ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'salmon3', 'snow4')) +
  labs(title = paste0('NSCLC-NOS histology fraction'), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) +
  guides(fill = guide_legend(title = 'TP53 status')) + theme(legend.position = "top")

Extended Data Figure 9F

edf9 <- readRDS(file.path(params$fileLoc,'edf9.rds'))

ik = 'mut_tp53_hist'
freq.plot.tmp.f = edf9

freq.plot.tmp.f$clust.int = freq.plot.tmp.f$NSCLC_NOS_VS_Others
freq.plot.tmp.f = freq.plot.tmp.f[!duplicated(freq.plot.tmp.f),]
col.num = which(colnames(freq.plot.tmp.f) == ik)
freq.plot.tmp.f$mut.int = freq.plot.tmp.f[,..col.num]
freq.plot.tmp.f = freq.plot.tmp.f[!is.na(mut.int),]
freq.plot.tmp.f$mut.int = as.numeric(freq.plot.tmp.f$mut.int)
res.plot = freq.plot.tmp.f[, prop.test(sum(mut.int), .N) %>% dflm %>% cbind(nmut = sum(mut.int), tot = .N), by = .(clust.int)][, fracmut := estimate]
res.plot$clust.int = factor(res.plot$clust.int, levels = c('NSCLC_NOS','Non-NSCLC_NOS'))
nsclc.nos.mut = res.plot[clust.int == 'NSCLC_NOS',]$nmut
nsclc.nos.wt = res.plot[clust.int == 'NSCLC_NOS',]$tot - res.plot[clust.int == 'NSCLC_NOS',]$nmut
others.mut = res.plot[clust.int == 'Non-NSCLC_NOS',]$nmut
others.wt = res.plot[clust.int == 'Non-NSCLC_NOS',]$tot - res.plot[clust.int == 'Non-NSCLC_NOS',]$nmut

nsclc.nos.others.fisher = matrix(c(nsclc.nos.mut, others.mut, nsclc.nos.wt, others.wt), nrow = 2, byrow = TRUE) %>% fisher.test %>% dflm

ggplot(res.plot, aes(x = clust.int, y = fracmut, fill = clust.int)) +
  geom_bar(stat = 'identity', position = position_dodge()) +
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.15) + theme_bw() + 
  scale_fill_manual(values = c(  'orchid3',   'snow4')) +
  labs(title = paste0('Fraction of samples with TP53 mut'), x = '', y = '') + theme_classic() +
  theme(plot.title = element_text(size = 20, face = 'bold'),
        axis.text.x = element_text(size = 0, angle = 0, hjust = 0.5),
        axis.text.y = element_text(size = 22, angle = 0, hjust = 1),
        axis.title.x = element_text(size = 0, face = 'plain'),
        axis.title.y = element_text(size = 5, face = 'bold'),
        axis.ticks.x = element_blank()) + 
  geom_text(mapping = aes(x = clust.int, y = ci.upper + 0.05, label = paste0(nmut, '/', tot)), size = 7) + ylim(0,1.05) +
  guides(fill = guide_legend(title = 'Histology')) + theme(legend.position = "top")

Extended Data Figure 9G

library(skitools)

edf9g_a <- readRDS(file.path(params$fileLoc,'edf9g_a.rds'))
edf9g_b <- readRDS(file.path(params$fileLoc,'edf9g_b.rds'))

oncoprint(edf9g_b, oncotab = edf9g_a,  genes = c('NKX2-1','SMARCA4','STK11','APC','KEAP1','ALK','MCL1','MAP2K1','EGFR','FGFR1','FOXP1','CDK6','BCL2L1', 'TP53','RB1','CDKN2A','TERT','MYC','ARID1A','PTEN','CCND1','PIK3CA','ERBB2','CCNE1','NF1'), sort.genes = FALSE, colnames.fontsize = 20, 
          rownames.fontsize = 15, signature.thresh = 20, track.height = 1.5, split.gap = 0.5, wes = TRUE, track.gap = track.height/2, drop = TRUE, drop.genes = TRUE, svevents = FALSE,  sort.tumors = FALSE,
          return.oncotab = FALSE, height = 12, width = 25,  filename = 'EDF9g_onco.pdf')
rendering to /gpfs/commons/projects/imielinski_web/jandrademartinez/EDF9g_onco.pdf

Extended Data Figure 10

Extended Data Figure 10A

library(skitools)
tmp.plot = readRDS(file.path(params$fileLoc,"edf10.rds"))

highlight_color <- "darkslategray3"  # Color for WCM-1
grey_color <- "grey"        # Color for other patients

tmp.plot$color <- ifelse(tmp.plot$patient == "WCM-1", highlight_color, grey_color)

p <- ggplot(tmp.plot, aes(x, y, colour = color), size = 3) + 
  geom_point() + 
  theme_bw() + 
  theme(legend.position = "top") +
  scale_colour_identity()
print(p)

## Extended Data Figure 10B

library(skitools)
tmp.plot = readRDS(file.path(params$fileLoc,"edf10.rds"))
library(RColorBrewer)

my_colors <- colorRampPalette(brewer.pal(12, "Set3"))(length(unique(tmp.plot$cluster)))
my_colors[1] <- 'turquoise4'
my_colors[2] <- 'mediumseagreen'
my_colors[3] <- 'darkred'
names(my_colors) <- unique(tmp.plot$cluster)
tmp.plot$color <- ifelse(tmp.plot$patient == "WCM-1", my_colors[tmp.plot$cluster], "grey")
p <- ggplot(tmp.plot, aes(x = x, y = y, colour = factor(cluster)), size = 3) + 
  geom_point(aes(color = ifelse(patient == "WCM-1", as.character(cluster), "Other"))) + 
  scale_color_manual(values = c(my_colors, Other = "grey"), 
                     breaks = c(names(my_colors), "Other"),
                     labels = c(names(my_colors), "Other Patients")) +
  theme_bw() + 
  theme(legend.position = "top") 
print(p)